From e19a8ffa98f806cf19a3304dfc66dec7098a5c04 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 14:21:36 +0200 Subject: [PATCH 01/16] Added TODOs to the new file structure for stacking (clean) --- autoPyTorch/ensemble/ensemble_builder.py | 338 ---------- .../ensemble/ensemble_builder_manager.py | 358 ++++++++++ autoPyTorch/ensemble/stacking_ensemble.py | 292 +++++++++ .../ensemble/stacking_ensemble_builder.py | 619 ++++++++++++++++++ autoPyTorch/evaluation/stacking_evaluator.py | 359 ++++++++++ autoPyTorch/optimizer/smbo.py | 2 +- test/test_ensemble/ensemble_utils.py | 2 +- test/test_ensemble/test_ensemble.py | 2 +- 8 files changed, 1631 insertions(+), 341 deletions(-) create mode 100644 autoPyTorch/ensemble/ensemble_builder_manager.py create mode 100644 autoPyTorch/ensemble/stacking_ensemble.py create mode 100644 autoPyTorch/ensemble/stacking_ensemble_builder.py create mode 100644 autoPyTorch/evaluation/stacking_evaluator.py diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py index 35a281235..311069e50 100644 --- a/autoPyTorch/ensemble/ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_builder.py @@ -1,4 +1,3 @@ -# -*- encoding: utf-8 -*- import glob import gzip import logging @@ -15,8 +14,6 @@ import zlib from typing import Dict, List, Optional, Set, Tuple, Union -import dask.distributed - import numpy as np import pandas as pd @@ -25,9 +22,6 @@ from sklearn.utils.validation import check_random_state -from smac.callbacks import IncorporateRunResultCallback -from smac.optimizer.smbo import SMBO -from smac.runhistory.runhistory import RunInfo, RunValue from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import BINARY @@ -44,338 +38,6 @@ MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' -class EnsembleBuilderManager(IncorporateRunResultCallback): - def __init__( - self, - start_time: float, - time_left_for_ensembles: float, - backend: Backend, - dataset_name: str, - task_type: int, - output_type: int, - metrics: List[autoPyTorchMetric], - opt_metric: str, - ensemble_size: int, - ensemble_nbest: int, - max_models_on_disc: Union[float, int], - seed: int, - precision: int, - max_iterations: Optional[int], - read_at_most: int, - ensemble_memory_limit: Optional[int], - random_state: int, - logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - pynisher_context: str = 'fork', - ): - """ SMAC callback to handle ensemble building - Args: - start_time: int - the time when this job was started, to account for any latency in job allocation - time_left_for_ensemble: int - How much time is left for the task. Job should finish within this allocated time - backend: util.backend.Backend - backend to write and read files - dataset_name: str - name of dataset - task_type: int - what type of output is expected. If Binary, we need to argmax the one hot encoding. - metrics: List[autoPyTorchMetric], - A set of metrics that will be used to get performance estimates - opt_metric: str - name of the optimization metrics - ensemble_size: int - maximal size of ensemble (passed to ensemble_selection) - ensemble_nbest: int/float - if int: consider only the n best prediction - if float: consider only this fraction of the best models - Both wrt to validation predictions - If performance_range_threshold > 0, might return less models - max_models_on_disc: Union[float, int] - Defines the maximum number of models that are kept in the disc. 
- If int, it must be greater or equal than 1, and dictates the max number of - models to keep. - If float, it will be interpreted as the max megabytes allowed of disc space. That - is, if the number of ensemble candidates require more disc space than this float - value, the worst models will be deleted to keep within this budget. - Models and predictions of the worst-performing models will be deleted then. - If None, the feature is disabled. - It defines an upper bound on the models that can be used in the ensemble. - seed: int - random seed - max_iterations: int - maximal number of iterations to run this script - (default None --> deactivated) - precision (int): [16,32,64,128] - precision of floats to read the predictions - memory_limit: Optional[int] - memory limit in mb. If ``None``, no memory limit is enforced. - read_at_most: int - read at most n new prediction files in each iteration - logger_port: int - port in where to publish a msg - pynisher_context: str - The multiprocessing context for pynisher. One of spawn/fork/forkserver. - - Returns: - List[Tuple[int, float, float, float]]: - A list with the performance history of this ensemble, of the form - [[pandas_timestamp, train_performance, val_performance, test_performance], ...] - """ - self.start_time = start_time - self.time_left_for_ensembles = time_left_for_ensembles - self.backend = backend - self.dataset_name = dataset_name - self.task_type = task_type - self.output_type = output_type - self.metrics = metrics - self.opt_metric = opt_metric - self.ensemble_size = ensemble_size - self.ensemble_nbest = ensemble_nbest - self.max_models_on_disc: Union[float, int] = max_models_on_disc - self.seed = seed - self.precision = precision - self.max_iterations = max_iterations - self.read_at_most = read_at_most - self.ensemble_memory_limit = ensemble_memory_limit - self.random_state = random_state - self.logger_port = logger_port - self.pynisher_context = pynisher_context - - # Store something similar to SMAC's runhistory - self.history: List[Dict[str, float]] = [] - - # We only submit new ensembles when there is not an active ensemble job - self.futures: List[dask.Future] = [] - - # The last criteria is the number of iterations - self.iteration = 0 - - # Keep track of when we started to know when we need to finish! 
- self.start_time = time.time() - - def __call__( - self, - smbo: 'SMBO', - run_info: RunInfo, - result: RunValue, - time_left: float, - ) -> None: - self.build_ensemble(smbo.tae_runner.client) - - def build_ensemble( - self, - dask_client: dask.distributed.Client, - unit_test: bool = False - ) -> None: - - # The second criteria is elapsed time - elapsed_time = time.time() - self.start_time - - logger = get_named_client_logger( - name='EnsembleBuilder', - port=self.logger_port, - ) - - # First test for termination conditions - if self.time_left_for_ensembles < elapsed_time: - logger.info( - "Terminate ensemble building as not time is left (run for {}s)".format( - elapsed_time - ), - ) - return - if self.max_iterations is not None and self.max_iterations <= self.iteration: - logger.info( - "Terminate ensemble building because of max iterations: {} of {}".format( - self.max_iterations, - self.iteration - ) - ) - return - - if len(self.futures) != 0: - if self.futures[0].done(): - result = self.futures.pop().result() - if result: - ensemble_history, self.ensemble_nbest, _, _ = result - logger.debug("iteration={} @ elapsed_time={} has history={}".format( - self.iteration, - elapsed_time, - ensemble_history, - )) - self.history.extend(ensemble_history) - - # Only submit new jobs if the previous ensemble job finished - if len(self.futures) == 0: - - # Add the result of the run - # On the next while iteration, no references to - # ensemble builder object, so it should be garbage collected to - # save memory while waiting for resources - # Also, notice how ensemble nbest is returned, so we don't waste - # iterations testing if the deterministic predictions size can - # be fitted in memory - try: - # Submit a Dask job from this job, to properly - # see it in the dask diagnostic dashboard - # Notice that the forked ensemble_builder_process will - # wait for the below function to be done - self.futures.append(dask_client.submit( - fit_and_return_ensemble, - backend=self.backend, - dataset_name=self.dataset_name, - task_type=self.task_type, - output_type=self.output_type, - metrics=self.metrics, - opt_metric=self.opt_metric, - ensemble_size=self.ensemble_size, - ensemble_nbest=self.ensemble_nbest, - max_models_on_disc=self.max_models_on_disc, - seed=self.seed, - precision=self.precision, - memory_limit=self.ensemble_memory_limit, - read_at_most=self.read_at_most, - random_state=self.seed, - end_at=self.start_time + self.time_left_for_ensembles, - iteration=self.iteration, - return_predictions=False, - priority=100, - pynisher_context=self.pynisher_context, - logger_port=self.logger_port, - unit_test=unit_test, - )) - - logger.info( - "{}/{} Started Ensemble builder job at {} for iteration {}.".format( - # Log the client to make sure we - # remain connected to the scheduler - self.futures[0], - dask_client, - time.strftime("%Y.%m.%d-%H.%M.%S"), - self.iteration, - ), - ) - self.iteration += 1 - except Exception as e: - exception_traceback = traceback.format_exc() - error_message = repr(e) - logger.critical(exception_traceback) - logger.critical(error_message) - - -def fit_and_return_ensemble( - backend: Backend, - dataset_name: str, - task_type: int, - output_type: int, - metrics: List[autoPyTorchMetric], - opt_metric: str, - ensemble_size: int, - ensemble_nbest: int, - max_models_on_disc: Union[float, int], - seed: int, - precision: int, - memory_limit: Optional[int], - read_at_most: int, - random_state: int, - end_at: float, - iteration: int, - return_predictions: bool, - pynisher_context: str, - 
logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - unit_test: bool = False, -) -> Tuple[ - List[Dict[str, float]], - int, - Optional[np.ndarray], - Optional[np.ndarray], -]: - """ - A short function to fit and create an ensemble. It is just a wrapper to easily send - a request to dask to create an ensemble and clean the memory when finished - Parameters - ---------- - backend: util.backend.Backend - backend to write and read files - dataset_name: str - name of dataset - metrics: List[autoPyTorchMetric], - A set of metrics that will be used to get performance estimates - opt_metric: - Name of the metric to optimize - task_type: int - type of output expected in the ground truth - ensemble_size: int - maximal size of ensemble (passed to ensemble.ensemble_selection) - ensemble_nbest: int/float - if int: consider only the n best prediction - if float: consider only this fraction of the best models - Both wrt to validation predictions - If performance_range_threshold > 0, might return less models - max_models_on_disc: int - Defines the maximum number of models that are kept in the disc. - If int, it must be greater or equal than 1, and dictates the max number of - models to keep. - If float, it will be interpreted as the max megabytes allowed of disc space. That - is, if the number of ensemble candidates require more disc space than this float - value, the worst models will be deleted to keep within this budget. - Models and predictions of the worst-performing models will be deleted then. - If None, the feature is disabled. - It defines an upper bound on the models that can be used in the ensemble. - seed: int - random seed - precision (int): [16,32,64,128] - precision of floats to read the predictions - memory_limit: Optional[int] - memory limit in mb. If ``None``, no memory limit is enforced. - read_at_most: int - read at most n new prediction files in each iteration - end_at: float - At what time the job must finish. Needs to be the endtime and not the time left - because we do not know when dask schedules the job. - iteration: int - The current iteration - pynisher_context: str - Context to use for multiprocessing, can be either fork, spawn or forkserver. - logger_port: int - The port where the logging server is listening to. - unit_test: bool - Turn on unit testing mode. This currently makes fit_ensemble raise a MemoryError. - Having this is very bad coding style, but I did not find a way to make - unittest.mock work through the pynisher with all spawn contexts. If you know a - better solution, please let us know by opening an issue. - Returns - ------- - List[Tuple[int, float, float, float]] - A list with the performance history of this ensemble, of the form - [[pandas_timestamp, train_performance, val_performance, test_performance], ...] 
- """ - result = EnsembleBuilder( - backend=backend, - dataset_name=dataset_name, - task_type=task_type, - output_type=output_type, - metrics=metrics, - opt_metric=opt_metric, - ensemble_size=ensemble_size, - ensemble_nbest=ensemble_nbest, - max_models_on_disc=max_models_on_disc, - seed=seed, - precision=precision, - memory_limit=memory_limit, - read_at_most=read_at_most, - random_state=random_state, - logger_port=logger_port, - unit_test=unit_test, - ).run( - end_at=end_at, - iteration=iteration, - return_predictions=return_predictions, - pynisher_context=pynisher_context, - ) - return result - - class EnsembleBuilder(object): def __init__( self, diff --git a/autoPyTorch/ensemble/ensemble_builder_manager.py b/autoPyTorch/ensemble/ensemble_builder_manager.py new file mode 100644 index 000000000..845992064 --- /dev/null +++ b/autoPyTorch/ensemble/ensemble_builder_manager.py @@ -0,0 +1,358 @@ +# -*- encoding: utf-8 -*- +import logging +import logging.handlers +import time +import traceback +from typing import Dict, List, Optional, Tuple, Union + +import dask.distributed + +import numpy as np + +import pandas as pd + + + +from smac.callbacks import IncorporateRunResultCallback +from smac.optimizer.smbo import SMBO +from smac.runhistory.runhistory import RunInfo, RunValue + +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.constants import BINARY +from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.utils.logging_ import get_named_client_logger + + +class EnsembleBuilderManager(IncorporateRunResultCallback): + def __init__( + self, + start_time: float, + time_left_for_ensembles: float, + backend: Backend, + dataset_name: str, + task_type: int, + output_type: int, + metrics: List[autoPyTorchMetric], + opt_metric: str, + ensemble_size: int, + ensemble_nbest: int, + max_models_on_disc: Union[float, int], + seed: int, + precision: int, + max_iterations: Optional[int], + read_at_most: int, + ensemble_memory_limit: Optional[int], + random_state: int, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + pynisher_context: str = 'fork', + ): + """ SMAC callback to handle ensemble building + Args: + start_time: int + the time when this job was started, to account for any latency in job allocation + time_left_for_ensemble: int + How much time is left for the task. Job should finish within this allocated time + backend: util.backend.Backend + backend to write and read files + dataset_name: str + name of dataset + task_type: int + what type of output is expected. If Binary, we need to argmax the one hot encoding. + metrics: List[autoPyTorchMetric], + A set of metrics that will be used to get performance estimates + opt_metric: str + name of the optimization metrics + ensemble_size: int + maximal size of ensemble (passed to ensemble_selection) + ensemble_nbest: int/float + if int: consider only the n best prediction + if float: consider only this fraction of the best models + Both wrt to validation predictions + If performance_range_threshold > 0, might return less models + max_models_on_disc: Union[float, int] + Defines the maximum number of models that are kept in the disc. + If int, it must be greater or equal than 1, and dictates the max number of + models to keep. + If float, it will be interpreted as the max megabytes allowed of disc space. 
That + is, if the number of ensemble candidates require more disc space than this float + value, the worst models will be deleted to keep within this budget. + Models and predictions of the worst-performing models will be deleted then. + If None, the feature is disabled. + It defines an upper bound on the models that can be used in the ensemble. + seed: int + random seed + max_iterations: int + maximal number of iterations to run this script + (default None --> deactivated) + precision (int): [16,32,64,128] + precision of floats to read the predictions + memory_limit: Optional[int] + memory limit in mb. If ``None``, no memory limit is enforced. + read_at_most: int + read at most n new prediction files in each iteration + logger_port: int + port in where to publish a msg + pynisher_context: str + The multiprocessing context for pynisher. One of spawn/fork/forkserver. + + Returns: + List[Tuple[int, float, float, float]]: + A list with the performance history of this ensemble, of the form + [[pandas_timestamp, train_performance, val_performance, test_performance], ...] + """ + self.start_time = start_time + self.time_left_for_ensembles = time_left_for_ensembles + self.backend = backend + self.dataset_name = dataset_name + self.task_type = task_type + self.output_type = output_type + self.metrics = metrics + self.opt_metric = opt_metric + self.ensemble_size = ensemble_size + self.ensemble_nbest = ensemble_nbest + self.max_models_on_disc: Union[float, int] = max_models_on_disc + self.seed = seed + self.precision = precision + self.max_iterations = max_iterations + self.read_at_most = read_at_most + self.ensemble_memory_limit = ensemble_memory_limit + self.random_state = random_state + self.logger_port = logger_port + self.pynisher_context = pynisher_context + + # Store something similar to SMAC's runhistory + self.history: List[Dict[str, float]] = [] + + # We only submit new ensembles when there is not an active ensemble job + self.futures: List[dask.Future] = [] + + # The last criteria is the number of iterations + self.iteration = 0 + + # Keep track of when we started to know when we need to finish! 
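+        # (Note: the line below re-reads the clock, so it effectively
+        # overwrites the ``start_time`` value stored from the constructor
+        # argument above.)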
+ self.start_time = time.time() + + def __call__( + self, + smbo: 'SMBO', + run_info: RunInfo, + result: RunValue, + time_left: float, + ) -> None: + self.build_ensemble(smbo.tae_runner.client) + + def build_ensemble( + self, + dask_client: dask.distributed.Client, + unit_test: bool = False + ) -> None: + + # The second criteria is elapsed time + elapsed_time = time.time() - self.start_time + + logger = get_named_client_logger( + name='EnsembleBuilder', + port=self.logger_port, + ) + + # First test for termination conditions + if self.time_left_for_ensembles < elapsed_time: + logger.info( + "Terminate ensemble building as not time is left (run for {}s)".format( + elapsed_time + ), + ) + return + if self.max_iterations is not None and self.max_iterations <= self.iteration: + logger.info( + "Terminate ensemble building because of max iterations: {} of {}".format( + self.max_iterations, + self.iteration + ) + ) + return + + if len(self.futures) != 0: + if self.futures[0].done(): + result = self.futures.pop().result() + if result: + ensemble_history, self.ensemble_nbest, _, _ = result + logger.debug("iteration={} @ elapsed_time={} has history={}".format( + self.iteration, + elapsed_time, + ensemble_history, + )) + self.history.extend(ensemble_history) + + # Only submit new jobs if the previous ensemble job finished + if len(self.futures) == 0: + + # Add the result of the run + # On the next while iteration, no references to + # ensemble builder object, so it should be garbage collected to + # save memory while waiting for resources + # Also, notice how ensemble nbest is returned, so we don't waste + # iterations testing if the deterministic predictions size can + # be fitted in memory + try: + # Submit a Dask job from this job, to properly + # see it in the dask diagnostic dashboard + # Notice that the forked ensemble_builder_process will + # wait for the below function to be done + self.futures.append(dask_client.submit( + fit_and_return_ensemble, + backend=self.backend, + dataset_name=self.dataset_name, + task_type=self.task_type, + output_type=self.output_type, + metrics=self.metrics, + opt_metric=self.opt_metric, + ensemble_size=self.ensemble_size, + ensemble_nbest=self.ensemble_nbest, + max_models_on_disc=self.max_models_on_disc, + seed=self.seed, + precision=self.precision, + memory_limit=self.ensemble_memory_limit, + read_at_most=self.read_at_most, + random_state=self.seed, + end_at=self.start_time + self.time_left_for_ensembles, + iteration=self.iteration, + return_predictions=False, + priority=100, + pynisher_context=self.pynisher_context, + logger_port=self.logger_port, + unit_test=unit_test, + )) + + logger.info( + "{}/{} Started Ensemble builder job at {} for iteration {}.".format( + # Log the client to make sure we + # remain connected to the scheduler + self.futures[0], + dask_client, + time.strftime("%Y.%m.%d-%H.%M.%S"), + self.iteration, + ), + ) + self.iteration += 1 + except Exception as e: + exception_traceback = traceback.format_exc() + error_message = repr(e) + logger.critical(exception_traceback) + logger.critical(error_message) + + +def fit_and_return_ensemble( + backend: Backend, + dataset_name: str, + task_type: int, + output_type: int, + metrics: List[autoPyTorchMetric], + opt_metric: str, + ensemble_size: int, + ensemble_nbest: int, + max_models_on_disc: Union[float, int], + seed: int, + precision: int, + memory_limit: Optional[int], + read_at_most: int, + random_state: int, + end_at: float, + iteration: int, + return_predictions: bool, + pynisher_context: str, + 
logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + unit_test: bool = False, +) -> Tuple[ + List[Dict[str, float]], + int, + Optional[np.ndarray], + Optional[np.ndarray], +]: + """ + A short function to fit and create an ensemble. It is just a wrapper to easily send + a request to dask to create an ensemble and clean the memory when finished + Parameters + ---------- + backend: util.backend.Backend + backend to write and read files + dataset_name: str + name of dataset + metrics: List[autoPyTorchMetric], + A set of metrics that will be used to get performance estimates + opt_metric: + Name of the metric to optimize + task_type: int + type of output expected in the ground truth + ensemble_size: int + maximal size of ensemble (passed to ensemble.ensemble_selection) + ensemble_nbest: int/float + if int: consider only the n best prediction + if float: consider only this fraction of the best models + Both wrt to validation predictions + If performance_range_threshold > 0, might return less models + max_models_on_disc: int + Defines the maximum number of models that are kept in the disc. + If int, it must be greater or equal than 1, and dictates the max number of + models to keep. + If float, it will be interpreted as the max megabytes allowed of disc space. That + is, if the number of ensemble candidates require more disc space than this float + value, the worst models will be deleted to keep within this budget. + Models and predictions of the worst-performing models will be deleted then. + If None, the feature is disabled. + It defines an upper bound on the models that can be used in the ensemble. + seed: int + random seed + precision (int): [16,32,64,128] + precision of floats to read the predictions + memory_limit: Optional[int] + memory limit in mb. If ``None``, no memory limit is enforced. + read_at_most: int + read at most n new prediction files in each iteration + end_at: float + At what time the job must finish. Needs to be the endtime and not the time left + because we do not know when dask schedules the job. + iteration: int + The current iteration + pynisher_context: str + Context to use for multiprocessing, can be either fork, spawn or forkserver. + logger_port: int + The port where the logging server is listening to. + unit_test: bool + Turn on unit testing mode. This currently makes fit_ensemble raise a MemoryError. + Having this is very bad coding style, but I did not find a way to make + unittest.mock work through the pynisher with all spawn contexts. If you know a + better solution, please let us know by opening an issue. + Returns + ------- + List[Tuple[int, float, float, float]] + A list with the performance history of this ensemble, of the form + [[pandas_timestamp, train_performance, val_performance, test_performance], ...] 
+ """ + result = EnsembleBuilder( + backend=backend, + dataset_name=dataset_name, + task_type=task_type, + output_type=output_type, + metrics=metrics, + opt_metric=opt_metric, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc, + seed=seed, + precision=precision, + memory_limit=memory_limit, + read_at_most=read_at_most, + random_state=random_state, + logger_port=logger_port, + unit_test=unit_test, + ).run( + end_at=end_at, + iteration=iteration, + return_predictions=return_predictions, + pynisher_context=pynisher_context, + ) + return result + + diff --git a/autoPyTorch/ensemble/stacking_ensemble.py b/autoPyTorch/ensemble/stacking_ensemble.py new file mode 100644 index 000000000..425d2d8ba --- /dev/null +++ b/autoPyTorch/ensemble/stacking_ensemble.py @@ -0,0 +1,292 @@ +from collections import Counter +from typing import Any, Dict, List, Tuple, Union + +import numpy as np + +from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble +from autoPyTorch.pipeline.base_pipeline import BasePipeline +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss + + +# TODO: for now we can use this and pass this to stacking evaluator. +# TODO: This can be achieved by using `backend.load_ensemble` +# TODO: it loads the last stored ensemble. So we have access to it. +# TODO: the ensemble is a pickled file containing the fitted ensemble of this class. +# TODO: Think of functionality of the functions in this class adjusted for stacking. +class StackingEnsemble(AbstractEnsemble): + def __init__( + self, + ensemble_size: int, + metric: autoPyTorchMetric, + task_type: int, + random_state: np.random.RandomState, + ) -> None: + self.ensemble_size = ensemble_size + self.metric = metric + self.random_state = random_state + self.task_type = task_type + + def __getstate__(self) -> Dict[str, Any]: + # Cannot serialize a metric if + # it is user defined. + # That is, if doing pickle dump + # the metric won't be the same as the + # one in __main__. we don't use the metric + # in the EnsembleSelection so this should + # be fine + self.metric = None # type: ignore + return self.__dict__ + + def fit( + self, + predictions: List[np.ndarray], + labels: np.ndarray, + identifiers: List[Tuple[int, int, float]], + ) -> AbstractEnsemble: + """ + Builds a ensemble given the individual models out of fold predictions. + Fundamentally, defines a set of weights on how to perform a soft-voting + aggregation of the models in the given identifiers. + + Args: + predictions (List[np.ndarray]): + A list of individual model predictions of shape (n_datapoints, n_targets) + corresponding to the OutOfFold estimate of the ground truth + labels (np.ndarray): + The ground truth targets of shape (n_datapoints, n_targets) + identifiers: List[Tuple[int, int, float]] + A list of model identifiers, each with the form + (seed, number of run, budget) + + Returns: + A copy of self + """ + self.ensemble_size = int(self.ensemble_size) + if self.ensemble_size < 1: + raise ValueError('Ensemble size cannot be less than one!') + + self._fit(predictions, labels) + self._calculate_weights() + self.identifiers_ = identifiers + return self + + # TODO: fit a stacked ensemble. + def _fit( + self, + predictions: List[np.ndarray], + labels: np.ndarray, + ) -> None: + """ + Fast version of Rich Caruana's ensemble selection method. 
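+        (Copied from ``EnsembleSelection`` as a placeholder; per the TODO
+        above, this is expected to be replaced by an actual stacking fit.)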
+ + For more details, please check the paper + "Ensemble Selection from Library of Models" by R Caruana (2004) + + Args: + predictions (List[np.ndarray]): + A list of individual model predictions of shape (n_datapoints, n_targets) + corresponding to the OutOfFold estimate of the ground truth + identifiers (List[Tuple[int, int, float]]): + A list of model identifiers, each with the form + (seed, number of run, budget) + """ + self.num_input_models_ = len(predictions) + + ensemble: List[np.ndarray] = [] + trajectory = [] + order = [] + + ensemble_size = self.ensemble_size + + weighted_ensemble_prediction = np.zeros( + predictions[0].shape, + dtype=np.float64, + ) + fant_ensemble_prediction = np.zeros( + weighted_ensemble_prediction.shape, + dtype=np.float64, + ) + for i in range(ensemble_size): + losses = np.zeros( + (len(predictions)), + dtype=np.float64, + ) + s = len(ensemble) + if s > 0: + np.add( + weighted_ensemble_prediction, + ensemble[-1], + out=weighted_ensemble_prediction, + ) + + # Memory-efficient averaging! + for j, pred in enumerate(predictions): + # fant_ensemble_prediction is the prediction of the current ensemble + # and should be ([predictions[selected_prev_iterations] + predictions[j])/(s+1) + # We overwrite the contents of fant_ensemble_prediction + # directly with weighted_ensemble_prediction + new_prediction and then scale for avg + np.add( + weighted_ensemble_prediction, + pred, + out=fant_ensemble_prediction + ) + np.multiply( + fant_ensemble_prediction, + (1. / float(s + 1)), + out=fant_ensemble_prediction + ) + + # Calculate loss is versatile and can return a dict of slosses + losses[j] = calculate_loss( + metrics=[self.metric], + target=labels, + prediction=fant_ensemble_prediction, + task_type=self.task_type, + )[self.metric.name] + + all_best = np.argwhere(losses == np.nanmin(losses)).flatten() + best = self.random_state.choice(all_best) + ensemble.append(predictions[best]) + trajectory.append(losses[best]) + order.append(best) + + # Handle special case + if len(predictions) == 1: + break + + self.indices_: List[int] = order + self.trajectory_: List[float] = trajectory + self.train_loss_: float = trajectory[-1] + + # TODO: return 1 for models in layer 0, 2 for next and so on + # TODO: 0 for models that are not in stack + def _calculate_weights(self) -> None: + """ + Calculates the contribution each of the individual models + should have, in the final ensemble soft voting. It does so by + a frequency counting scheme. In particular, how many times a model + was used during hill climbing optimization. + """ + ensemble_members = Counter(self.indices_).most_common() + weights = np.zeros( + (self.num_input_models_,), + dtype=np.float64, + ) + for ensemble_member in ensemble_members: + weight = float(ensemble_member[1]) / self.ensemble_size + weights[ensemble_member[0]] = weight + + if np.sum(weights) < 1: + weights = weights / np.sum(weights) + + self.weights_ = weights + + # TODO: Adjust this to use weights and make + def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: + """ + Given a list of predictions from the individual model, this method + aggregates the predictions using a soft voting scheme with the weights + found during training. + + Args: + predictions (List[np.ndarray]): + A list of predictions from the individual base models. 
+ + Returns: + average (np.ndarray): Soft voting predictions of ensemble models, using + the weights found during ensemble selection (self._weights) + """ + + average = np.zeros_like(predictions[0], dtype=np.float64) + tmp_predictions = np.empty_like(predictions[0], dtype=np.float64) + + # if predictions.shape[0] == len(self.weights_), + # predictions include those of zero-weight models. + if len(predictions) == len(self.weights_): + for pred, weight in zip(predictions, self.weights_): + np.multiply(pred, weight, out=tmp_predictions) + np.add(average, tmp_predictions, out=average) + + # if prediction model.shape[0] == len(non_null_weights), + # predictions do not include those of zero-weight models. + elif len(predictions) == np.count_nonzero(self.weights_): + non_null_weights = [w for w in self.weights_ if w > 0] + for pred, weight in zip(predictions, non_null_weights): + np.multiply(pred, weight, out=tmp_predictions) + np.add(average, tmp_predictions, out=average) + + # If none of the above applies, then something must have gone wrong. + else: + raise ValueError("The dimensions of ensemble predictions" + " and ensemble weights do not match!") + del tmp_predictions + return average + + def __str__(self) -> str: + return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \ + '\n\tWeights: %s\n\tIdentifiers: %s' % \ + (' '.join(['%d: %5f' % (idx, performance) + for idx, performance in enumerate(self.trajectory_)]), + self.indices_, self.weights_, + ' '.join([str(identifier) for idx, identifier in + enumerate(self.identifiers_) + if self.weights_[idx] > 0])) + + + def get_models_with_weights( + self, + models: Dict[Any, BasePipeline] + ) -> List[Tuple[float, BasePipeline]]: + """ + Handy function to tag the provided input models with a given weight. + + Args: + models (List[Tuple[float, BasePipeline]]): + A dictionary that maps a model's name to it's actual python object. + + Returns: + output (List[Tuple[float, BasePipeline]]): + each model with the related weight, sorted by ascending + performance. Notice that ensemble selection solves a minimization + problem. + """ + output = [] + for i, weight in enumerate(self.weights_): + if weight > 0.0: + identifier = self.identifiers_[i] + model = models[identifier] + output.append((weight, model)) + + output.sort(reverse=True, key=lambda t: t[0]) + + return output + + def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: + """ + After training of ensemble selection, not all models will be used. + Some of them will have zero weight. This procedure filters this models + out. 
+ + Returns: + output (List[Tuple[int, int, float]]): + The models actually used by ensemble selection + """ + output = [] + + for i, weight in enumerate(self.weights_): + identifier = self.identifiers_[i] + if weight > 0.0: + output.append(identifier) + + return output + + def get_validation_performance(self) -> float: + """ + Returns the best optimization performance seen during hill climbing + + Returns: + (float): + best ensemble training performance + """ + return self.trajectory_[-1] diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py new file mode 100644 index 000000000..39a96bbf6 --- /dev/null +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -0,0 +1,619 @@ +import glob +import logging +import logging.handlers +import math +import os +import pickle +import time +import traceback +import zlib +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np + +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.constants import BINARY +from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble +from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder +from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss +from autoPyTorch.utils.logging_ import get_named_client_logger + +Y_ENSEMBLE = 0 +Y_TEST = 1 + +MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' + + +# TODO: think of what functions are needed to support stacking +# TODO: make functions to support stacking. +class StackingEnsembleBuilder(EnsembleBuilder): + def __init__( + self, + backend: Backend, + dataset_name: str, + task_type: int, + output_type: int, + metrics: List[autoPyTorchMetric], + opt_metric: str, + ensemble_size: int = 10, + ensemble_nbest: int = 100, + max_models_on_disc: Union[float, int] = 100, + performance_range_threshold: float = 0, + seed: int = 1, + precision: int = 32, + memory_limit: Optional[int] = 1024, + read_at_most: int = 5, + random_state: Optional[Union[int, np.random.RandomState]] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + unit_test: bool = False, + ): + """ + Constructor + Parameters + ---------- + backend: util.backend.Backend + backend to write and read files + dataset_name: str + name of dataset + task_type: int + type of ML task + metrics: List[autoPyTorchMetric], + name of metric to score predictions + opt_metric: str + name of the metric to optimize + ensemble_size: int + maximal size of ensemble (passed to ensemble.ensemble_selection) + ensemble_nbest: int/float + if int: consider only the n best prediction + if float: consider only this fraction of the best models + Both wrt to validation predictions + If performance_range_threshold > 0, might return less models + max_models_on_disc: Union[float, int] + Defines the maximum number of models that are kept in the disc. + If int, it must be greater or equal than 1, and dictates the max number of + models to keep. + If float, it will be interpreted as the max megabytes allowed of disc space. That + is, if the number of ensemble candidates require more disc space than this float + value, the worst models will be deleted to keep within this budget. + Models and predictions of the worst-performing models will be deleted then. + If None, the feature is disabled. 
+ It defines an upper bound on the models that can be used in the ensemble. + performance_range_threshold: float + Keep only models that are better than: + dummy + (best - dummy)*performance_range_threshold + E.g dummy=2, best=4, thresh=0.5 --> only consider models with score > 3 + Will at most return the minimum between ensemble_nbest models, + and max_models_on_disc. Might return less + seed: int + random seed + precision: [16,32,64,128] + precision of floats to read the predictions + memory_limit: Optional[int] + memory limit in mb. If ``None``, no memory limit is enforced. + read_at_most: int + read at most n new prediction files in each iteration + logger_port: int + port that receives logging records + unit_test: bool + Turn on unit testing mode. This currently makes fit_ensemble raise a MemoryError. + Having this is very bad coding style, but I did not find a way to make + unittest.mock work through the pynisher with all spawn contexts. If you know a + better solution, please let us know by opening an issue. + """ + + super(StackingEnsembleBuilder, self).__init__( + backend=backend, dataset_name=dataset_name, task_type=task_type, + output_type=output_type, metrics=metrics, opt_metric=opt_metric, + ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc, + performance_range_threshold=performance_range_threshold, + seed=seed, precision=precision, memory_limit=memory_limit, + read_at_most=read_at_most, random_state=random_state, + logger_port=logger_port, unit_test=unit_test) + + # TODO: This is the main wrapper to the EnsembleSelection class which fits + # TODO: the ensemble + def main( + self, time_left: float, iteration: int, return_predictions: bool, + ) -> Tuple[ + List[Dict[str, float]], + int, + Optional[np.ndarray], + Optional[np.ndarray], + ]: + """ + This is the main function of the ensemble builder process and can be considered + a wrapper over the ensemble selection method implemented y EnsembleSelection class. + + This method is going to be called multiple times by the main process, to + build and ensemble, in case the SMAC process produced new models and to provide + anytime results. + + On this regard, this method mainly: + 1- select from all the individual models that smac created, the N-best candidates + (this in the scenario that N > ensemble_nbest argument to this class). This is + done based on a score calculated via the metrics argument. + 2- This pre-selected candidates are provided to the ensemble selection method + and if a ensemble is found under the provided memory/time constraints, a new + ensemble is proposed. + 3- Because this process will be called multiple times, it performs checks to make + sure a new ensenmble is only proposed if new predictions are available, as well + as making sure we do not run out of resources (like disk space) + + Args: + time_left (float): + How much time is left for the ensemble builder process + iteration (int): + Which is the current iteration + return_predictions (bool): + Whether we want to return the predictions of the current model or not + + Returns: + ensemble_history (Dict): + A snapshot of both test and optimization performance. For debugging. + ensemble_nbest (int): + The user provides a direction on how many models to use in ensemble selection. + This number can be reduced internally if the memory requirements force it. + train_predictions (np.ndarray): + The optimization prediction from the current ensemble. 
+ test_predictions (np.ndarray): + The train prediction from the current ensemble. + """ + + # Pynisher jobs inside dask 'forget' + # the logger configuration. So we have to set it up + # accordingly + self.logger = get_named_client_logger( + name='EnsembleBuilder', + port=self.logger_port, + ) + + self.start_time = time.time() + train_pred, test_pred = None, None + + used_time = time.time() - self.start_time + self.logger.debug( + 'Starting iteration %d, time left: %f', + iteration, + time_left - used_time, + ) + + # populates self.read_preds and self.read_losses + if not self.compute_loss_per_model(): + if return_predictions: + return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred + else: + return self.ensemble_history, self.ensemble_nbest, None, None + + # Only the models with the n_best predictions are candidates + # to be in the ensemble + candidate_models = self.get_n_best_preds() + if not candidate_models: # no candidates yet + if return_predictions: + return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred + else: + return self.ensemble_history, self.ensemble_nbest, None, None + + # populates predictions in self.read_preds + # reduces selected models if file reading failed + n_sel_test = self.get_test_preds(selected_keys=candidate_models) + + # If any of n_sel_* is not empty and overlaps with candidate_models, + # then ensure candidate_models AND n_sel_test are sorted the same + candidate_models_set = set(candidate_models) + if candidate_models_set.intersection(n_sel_test): + candidate_models = sorted(list(candidate_models_set.intersection( + n_sel_test))) + n_sel_test = candidate_models + else: + # This has to be the case + n_sel_test = [] + + if os.environ.get('ENSEMBLE_KEEP_ALL_CANDIDATES'): + for candidate in candidate_models: + self._has_been_candidate.add(candidate) + + # train ensemble + ensemble = self.fit_ensemble(selected_keys=candidate_models) + + # Save the ensemble for later use in the main module! + if ensemble is not None and self.SAVE2DISC: + self.backend.save_ensemble(ensemble, iteration, self.seed) + + # Delete files of non-candidate models - can only be done after fitting the ensemble and + # saving it to disc so we do not accidentally delete models in the previous ensemble + if self.max_resident_models is not None: + self._delete_excess_models(selected_keys=candidate_models) + + # Save the read losses status for the next iteration + with open(self.ensemble_loss_file, "wb") as memory: + pickle.dump(self.read_losses, memory) + + if ensemble is not None: + train_pred = self.predict(set_="train", + ensemble=ensemble, + selected_keys=candidate_models, + n_preds=len(candidate_models), + index_run=iteration) + # TODO if predictions fails, build the model again during the + # next iteration! 
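+            # Test predictions use only the keys for which test prediction
+            # files could actually be read from disc (``n_sel_test``).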
+ test_pred = self.predict(set_="test", + ensemble=ensemble, + selected_keys=n_sel_test, + n_preds=len(candidate_models), + index_run=iteration) + + # Add a score to run history to see ensemble progress + self._add_ensemble_trajectory( + train_pred, + test_pred + ) + + # The loaded predictions and the hash can only be saved after the ensemble has been + # built, because the hash is computed during the construction of the ensemble + with open(self.ensemble_memory_file, "wb") as memory: + pickle.dump((self.read_preds, self.last_hash), memory) + + if return_predictions: + return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred + else: + return self.ensemble_history, self.ensemble_nbest, None, None + + def get_disk_consumption(self, pred_path: str) -> float: + """ + gets the cost of a model being on disc + """ + + match = self.model_fn_re.search(pred_path) + if not match: + raise ValueError("Invalid path format %s" % pred_path) + _seed = int(match.group(1)) + _num_run = int(match.group(2)) + _budget = float(match.group(3)) + + stored_files_for_run = os.listdir( + self.backend.get_numrun_directory(_seed, _num_run, _budget)) + stored_files_for_run = [ + os.path.join(self.backend.get_numrun_directory(_seed, _num_run, _budget), file_name) + for file_name in stored_files_for_run] + this_model_cost = sum([os.path.getsize(path) for path in stored_files_for_run]) + + # get the megabytes + return round(this_model_cost / math.pow(1024, 2), 2) + + # TODO: change this function, to compute loss according to Lavesque et al. + # TODO: this will help us in choosing the model with the lowest ensemble error. + def compute_loss_per_model(self) -> bool: + """ + Compute the loss of the predictions on ensemble building data set; + populates self.read_preds and self.read_losses + """ + + self.logger.debug("Read ensemble data set predictions") + + if self.y_true_ensemble is None: + try: + self.y_true_ensemble = self.backend.load_targets_ensemble() + except FileNotFoundError: + self.logger.debug( + "Could not find true targets on ensemble data set: %s", + traceback.format_exc(), + ) + return False + + pred_path = os.path.join( + glob.escape(self.backend.get_runs_directory()), + '%d_*_*' % self.seed, + 'predictions_ensemble_%s_*_*.npy*' % self.seed, + ) + y_ens_files = glob.glob(pred_path) + y_ens_files = [y_ens_file for y_ens_file in y_ens_files + if y_ens_file.endswith('.npy') or y_ens_file.endswith('.npy.gz')] + self.y_ens_files = y_ens_files + # no validation predictions so far -- no files + if len(self.y_ens_files) == 0: + self.logger.debug("Found no prediction files on ensemble data set:" + " %s" % pred_path) + return False + + # First sort files chronologically + to_read = [] + for y_ens_fn in self.y_ens_files: + match = self.model_fn_re.search(y_ens_fn) + if match is None: + raise ValueError(f"Could not interpret file {y_ens_fn} " + "Something went wrong while scoring predictions") + _seed = int(match.group(1)) + _num_run = int(match.group(2)) + _budget = float(match.group(3)) + + to_read.append([y_ens_fn, match, _seed, _num_run, _budget]) + + n_read_files = 0 + # Now read file wrt to num_run + # Mypy assumes sorted returns an object because of the lambda. 
Can't get to recognize the list + # as a returning list, so as a work-around we skip next line + for y_ens_fn, match, _seed, _num_run, _budget in sorted(to_read, key=lambda x: x[3]): # type: ignore + if self.read_at_most and n_read_files >= self.read_at_most: + # limit the number of files that will be read + # to limit memory consumption + break + + if not y_ens_fn.endswith(".npy") and not y_ens_fn.endswith(".npy.gz"): + self.logger.info('Error loading file (not .npy or .npy.gz): %s', y_ens_fn) + continue + + if not self.read_losses.get(y_ens_fn): + self.read_losses[y_ens_fn] = { + "ens_loss": np.inf, + "mtime_ens": 0, + "mtime_test": 0, + "seed": _seed, + "num_run": _num_run, + "budget": _budget, + "disc_space_cost_mb": None, + # Lazy keys so far: + # 0 - not loaded + # 1 - loaded and in memory + # 2 - loaded but dropped again + # 3 - deleted from disk due to space constraints + "loaded": 0 + } + if not self.read_preds.get(y_ens_fn): + self.read_preds[y_ens_fn] = { + Y_ENSEMBLE: None, + Y_TEST: None, + } + + if self.read_losses[y_ens_fn]["mtime_ens"] == os.path.getmtime(y_ens_fn): + # same time stamp; nothing changed; + continue + + # actually read the predictions and compute their respective loss + try: + y_ensemble = self._read_np_fn(y_ens_fn) + losses = calculate_loss( + metrics=self.metrics, + target=self.y_true_ensemble, + prediction=y_ensemble, + task_type=self.task_type, + ) + + if np.isfinite(self.read_losses[y_ens_fn]["ens_loss"]): + self.logger.debug( + 'Changing ensemble loss for file %s from %f to %f ' + 'because file modification time changed? %f - %f', + y_ens_fn, + self.read_losses[y_ens_fn]["ens_loss"], + losses[self.opt_metric], + self.read_losses[y_ens_fn]["mtime_ens"], + os.path.getmtime(y_ens_fn), + ) + + self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] + + # It is not needed to create the object here + # To save memory, we just compute the loss. + self.read_losses[y_ens_fn]["mtime_ens"] = os.path.getmtime(y_ens_fn) + self.read_losses[y_ens_fn]["loaded"] = 2 + self.read_losses[y_ens_fn]["disc_space_cost_mb"] = self.get_disk_consumption( + y_ens_fn + ) + + n_read_files += 1 + + except Exception: + self.logger.warning( + 'Error loading %s: %s', + y_ens_fn, + traceback.format_exc(), + ) + self.read_losses[y_ens_fn]["ens_loss"] = np.inf + + self.logger.debug( + 'Done reading %d new prediction files. 
Loaded %d predictions in ' + 'total.', + n_read_files, + np.sum([pred["loaded"] > 0 for pred in self.read_losses.values()]) + ) + return True + + def get_test_preds(self, selected_keys: List[str]) -> List[str]: + """ + test predictions from disc + and store them in self.read_preds + Parameters + --------- + selected_keys: list + list of selected keys of self.read_preds + Return + ------ + success_keys: + all keys in selected keys for which we could read the valid and + test predictions + """ + success_keys_test = [] + + for k in selected_keys: + test_fn = glob.glob( + os.path.join( + glob.escape(self.backend.get_runs_directory()), + '%d_%d_%s' % ( + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"], + ), + 'predictions_test_%d_%d_%s.npy*' % ( + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"] + ) + ) + ) + test_fn = [tfn for tfn in test_fn if tfn.endswith('.npy') or tfn.endswith('.npy.gz')] + + if len(test_fn) == 0: + # self.logger.debug("Not found test prediction file (although " + # "ensemble predictions available):%s" % + # test_fn) + pass + else: + if ( + self.read_losses[k]["mtime_test"] == os.path.getmtime(test_fn[0]) + and k in self.read_preds + and self.read_preds[k][Y_TEST] is not None + ): + success_keys_test.append(k) + continue + try: + y_test = self._read_np_fn(test_fn[0]) + self.read_preds[k][Y_TEST] = y_test + success_keys_test.append(k) + self.read_losses[k]["mtime_test"] = os.path.getmtime(test_fn[0]) + except Exception: + self.logger.warning('Error loading %s: %s', + test_fn, traceback.format_exc()) + + return success_keys_test + + def fit_ensemble(self, selected_keys: List[str]) -> Optional[EnsembleSelection]: + """ + fit ensemble + + Parameters + --------- + selected_keys: list + list of selected keys of self.read_losses + + Returns + ------- + ensemble: EnsembleSelection + trained Ensemble + """ + + if self.unit_test: + raise MemoryError() + + predictions_train = [self.read_preds[k][Y_ENSEMBLE] for k in selected_keys] + include_num_runs = [ + ( + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"], + ) + for k in selected_keys] + + # check hash if ensemble training data changed + current_hash = "".join([ + str(zlib.adler32(predictions_train[i].data.tobytes())) + for i in range(len(predictions_train)) + ]) + if self.last_hash == current_hash: + self.logger.debug( + "No new model predictions selected -- skip ensemble building " + "-- current performance: %f", + self.validation_performance_, + ) + + return None + self.last_hash = current_hash + + opt_metric = [m for m in self.metrics if m.name == self.opt_metric][0] + if not opt_metric: + raise ValueError(f"Cannot optimize for {self.opt_metric} in {self.metrics} " + "as more than one unique optimization metric was found.") + + ensemble = EnsembleSelection( + ensemble_size=self.ensemble_size, + metric=opt_metric, + random_state=self.random_state, + task_type=self.task_type, + ) + + try: + self.logger.debug( + "Fitting the ensemble on %d models.", + len(predictions_train), + ) + start_time = time.time() + ensemble.fit(predictions_train, self.y_true_ensemble, + include_num_runs) + end_time = time.time() + self.logger.debug( + "Fitting the ensemble took %.2f seconds.", + end_time - start_time, + ) + self.logger.info(str(ensemble)) + self.validation_performance_ = min( + self.validation_performance_, + ensemble.get_validation_performance(), + ) + + except ValueError: + self.logger.error('Caught 
ValueError: %s', traceback.format_exc()) + return None + except IndexError: + self.logger.error('Caught IndexError: %s' + traceback.format_exc()) + return None + finally: + # Explicitly free memory + del predictions_train + + return ensemble + + def predict(self, set_: str, + ensemble: AbstractEnsemble, + selected_keys: list, + n_preds: int, + index_run: int) -> np.ndarray: + """ + save preditions on ensemble, validation and test data on disc + Parameters + ---------- + set_: ["test"] + data split name + ensemble: EnsembleSelection + trained Ensemble + selected_keys: list + list of selected keys of self.read_losses + n_preds: int + number of prediction models used for ensemble building + same number of predictions on valid and test are necessary + index_run: int + n-th time that ensemble predictions are written to disc + Return + ------ + y: np.ndarray + """ + self.logger.debug("Predicting the %s set with the ensemble!", set_) + + if set_ == 'test': + pred_set = Y_TEST + else: + pred_set = Y_ENSEMBLE + predictions = [self.read_preds[k][pred_set] for k in selected_keys] + + if n_preds == len(predictions): + y = ensemble.predict(predictions) + if self.output_type == BINARY: + y = y[:, 1] + if self.SAVE2DISC: + self.backend.save_predictions_as_txt( + predictions=y, + subset=set_, + idx=index_run, + prefix=self.dataset_name, + precision=8, + ) + return y + else: + self.logger.info( + "Found inconsistent number of predictions and models (%d vs " + "%d) for subset %s", + len(predictions), + n_preds, + set_, + ) + return None \ No newline at end of file diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py new file mode 100644 index 000000000..bf842ecb9 --- /dev/null +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -0,0 +1,359 @@ +from multiprocessing.queues import Queue +from typing import Any, Dict, List, Optional, Tuple, Union + +from ConfigSpace.configuration_space import Configuration + +import numpy as np + +from sklearn.base import BaseEstimator + +from smac.tae import StatusType + +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.constants import ( + CLASSIFICATION_TASKS, + MULTICLASSMULTIOUTPUT, +) +from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes +from autoPyTorch.evaluation.abstract_evaluator import ( + AbstractEvaluator, + fit_and_suppress_warnings +) +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.utils.common import dict_repr, subsampler +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + +__all__ = ['StackingEvaluator', 'eval_function'] + + +def _get_y_array(y: np.ndarray, task_type: int) -> np.ndarray: + if task_type in CLASSIFICATION_TASKS and task_type != \ + MULTICLASSMULTIOUTPUT: + return y.ravel() + else: + return y + + +class StackingEvaluator(AbstractEvaluator): + """ + This class builds a pipeline using the provided configuration. + A pipeline implementing the provided configuration is fitted + using the datamanager object retrieved from disc, via the backend. + After the pipeline is fitted, it is save to disc and the performance estimate + is communicated to the main process via a Queue. + + Attributes: + backend (Backend): + An object to interface with the disk storage. 
In particular, allows to + access the train and test datasets + queue (Queue): + Each worker available will instantiate an evaluator, and after completion, + it will return the evaluation result via a multiprocessing queue + metric (autoPyTorchMetric): + A scorer object that is able to evaluate how good a pipeline was fit. It + is a wrapper on top of the actual score method (a wrapper on top of scikit + lean accuracy for example) that formats the predictions accordingly. + budget: (float): + The amount of epochs/time a configuration is allowed to run. + budget_type (str): + The budget type, which can be epochs or time + pipeline_config (Optional[Dict[str, Any]]): + Defines the content of the pipeline being evaluated. For example, it + contains pipeline specific settings like logging name, or whether or not + to use tensorboard. + configuration (Union[int, str, Configuration]): + Determines the pipeline to be constructed. A dummy estimator is created for + integer configurations, a traditional machine learning pipeline is created + for string based configuration, and NAS is performed when a configuration + object is passed. + seed (int): + A integer that allows for reproducibility of results + output_y_hat_optimization (bool): + Whether this worker should output the target predictions, so that they are + stored on disk. Fundamentally, the resampling strategy might shuffle the + Y_train targets, so we store the split in order to re-use them for ensemble + selection. + num_run (Optional[int]): + An identifier of the current configuration being fit. This number is unique per + configuration. + include (Optional[Dict[str, Any]]): + An optional dictionary to include components of the pipeline steps. + exclude (Optional[Dict[str, Any]]): + An optional dictionary to exclude components of the pipeline steps. + disable_file_output (Union[bool, List[str]]): + By default, the model, it's predictions and other metadata is stored on disk + for each finished configuration. This argument allows the user to skip + saving certain file type, for example the model, from being written to disk. + init_params (Optional[Dict[str, Any]]): + Optional argument that is passed to each pipeline step. It is the equivalent of + kwargs for the pipeline steps. + logger_port (Optional[int]): + Logging is performed using a socket-server scheme to be robust against many + parallel entities that want to write to the same file. This integer states the + socket port for the communication channel. If None is provided, a traditional + logger is used. + all_supported_metrics (bool): + Whether all supported metric should be calculated for every configuration. 
+ search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + An object used to fine tune the hyperparameter search space of the pipeline + """ + def __init__(self, backend: Backend, queue: Queue, + metric: autoPyTorchMetric, + budget: float, + configuration: Union[int, str, Configuration], + budget_type: str = None, + pipeline_config: Optional[Dict[str, Any]] = None, + seed: int = 1, + output_y_hat_optimization: bool = True, + num_run: Optional[int] = None, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + disable_file_output: Union[bool, List] = False, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + keep_models: Optional[bool] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None) -> None: + super().__init__( + backend=backend, + queue=queue, + configuration=configuration, + metric=metric, + seed=seed, + output_y_hat_optimization=output_y_hat_optimization, + num_run=num_run, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates + ) + + # TODO: we cant store the ensemble pipelines with this class as it is initialised for every TAE (target algorithm evaluation). + # TODO: Therefore we will have to store pipelines using datamanager and load them, see if we only need predictions. + # TODO: but we will need the whole pipeline as we would like to predict with different dataset, like val or something + + self.splits = self.datamanager.splits + if self.splits is None: + raise AttributeError("Must have called create_splits on {}".format(self.datamanager.__class__.__name__)) + self.num_folds: int = len(self.splits) + self.Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + self.Y_train_targets: np.ndarray = np.ones(self.y_train.shape) * np.NaN + self.pipelines: List[Optional[BaseEstimator]] = [None] * self.num_folds + self.indices: List[Optional[Tuple[Union[np.ndarray, List], Union[np.ndarray, List]]]] = [None] * self.num_folds + + self.logger.debug("Search space updates :{}".format(self.search_space_updates)) + self.keep_models = keep_models + + def fit_predict_and_loss(self) -> None: + """Fit, predict and compute the loss for cross-validation and + holdout""" + assert self.splits is not None, "Can't fit pipeline in {} is datamanager.splits is None" \ + .format(self.__class__.__name__) + additional_run_info: Optional[Dict] = None + split_id = 0 + self.logger.info("Starting fit {}".format(split_id)) + + pipeline = self._get_pipeline() + + train_split, test_split = self.splits[split_id] + self.Y_optimization = self.y_train[test_split] + self.Y_actual_train = self.y_train[train_split] + y_train_pred, y_opt_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, + train_indices=train_split, + test_indices=test_split, + add_pipeline_to_self=True) + train_loss = self._loss(self.y_train[train_split], y_train_pred) + loss = self._loss(self.y_train[test_split], y_opt_pred) + + additional_run_info = pipeline.get_additional_run_info() if hasattr( + pipeline, 'get_additional_run_info') else {} + + status = StatusType.SUCCESS + + self.logger.debug("In train evaluator.fit_predict_and_loss, num_run: {} loss:{}," + " status: {},\nadditional run info:\n{}".format(self.num_run, + 
loss, + dict_repr(additional_run_info), + status)) + self.finish_up( + loss=loss, + train_loss=train_loss, + opt_pred=y_opt_pred, + valid_pred=y_valid_pred, + test_pred=y_test_pred, + additional_run_info=additional_run_info, + file_output=True, + status=status, + ) + + + def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Union[np.ndarray, List], + test_indices: Union[np.ndarray, List], + add_pipeline_to_self: bool + ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: + self.indices[fold] = ((train_indices, test_indices)) + + # See autoPyTorch/pipeline/components/base_component.py::autoPyTorchComponent for more details + # about fit_dictionary + X = {'train_indices': train_indices, + 'val_indices': test_indices, + 'split_id': fold, + 'num_run': self.num_run, + **self.fit_dictionary} # fit dictionary + y = None + fit_and_suppress_warnings(self.logger, pipeline, X, y) + self.logger.info("Model fitted, now predicting") + ( + Y_train_pred, + Y_opt_pred, + Y_valid_pred, + Y_test_pred + ) = self._predict( + pipeline, + train_indices=train_indices, + test_indices=test_indices, + ) + + if add_pipeline_to_self: + self.pipeline = pipeline + else: + self.pipelines[fold] = pipeline + + return Y_train_pred, Y_opt_pred, Y_valid_pred, Y_test_pred + + def _predict(self, pipeline: BaseEstimator, + test_indices: Union[np.ndarray, List], + train_indices: Union[np.ndarray, List] + ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: + # TODO: load ensemble members and predict using the whole ensemble. + + train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, + self.y_train[train_indices]) + + opt_pred = self.predict_function(subsampler(self.X_train, test_indices), pipeline, + self.y_train[train_indices]) + + if self.X_valid is not None: + valid_pred = self.predict_function(self.X_valid, pipeline, + self.y_valid) + else: + valid_pred = None + + if self.X_test is not None: + test_pred = self.predict_function(self.X_test, pipeline, + self.y_train[train_indices]) + else: + test_pred = None + + return train_pred, opt_pred, valid_pred, test_pred + + +# create closure for evaluating an algorithm +def eval_function( + backend: Backend, + queue: Queue, + metric: autoPyTorchMetric, + budget: float, + config: Optional[Configuration], + seed: int, + num_run: int, + include: Optional[Dict[str, Any]], + exclude: Optional[Dict[str, Any]], + disable_file_output: Union[bool, List], + output_y_hat_optimization: bool, + pipeline_config: Optional[Dict[str, Any]] = None, + budget_type: str = None, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + instance: str = None, +) -> None: + """ + This closure allows the communication between the ExecuteTaFuncWithQueue and the + pipeline trainer (TrainEvaluator). + + Fundamentally, smac calls the ExecuteTaFuncWithQueue.run() method, which internally + builds a TrainEvaluator. The TrainEvaluator builds a pipeline, stores the output files + to disc via the backend, and puts the performance result of the run in the queue. + + + Attributes: + backend (Backend): + An object to interface with the disk storage. 
In particular, allows to + access the train and test datasets + queue (Queue): + Each worker available will instantiate an evaluator, and after completion, + it will return the evaluation result via a multiprocessing queue + metric (autoPyTorchMetric): + A scorer object that is able to evaluate how good a pipeline was fit. It + is a wrapper on top of the actual score method (a wrapper on top of scikit + lean accuracy for example) that formats the predictions accordingly. + budget: (float): + The amount of epochs/time a configuration is allowed to run. + budget_type (str): + The budget type, which can be epochs or time + pipeline_config (Optional[Dict[str, Any]]): + Defines the content of the pipeline being evaluated. For example, it + contains pipeline specific settings like logging name, or whether or not + to use tensorboard. + config (Union[int, str, Configuration]): + Determines the pipeline to be constructed. + seed (int): + A integer that allows for reproducibility of results + output_y_hat_optimization (bool): + Whether this worker should output the target predictions, so that they are + stored on disk. Fundamentally, the resampling strategy might shuffle the + Y_train targets, so we store the split in order to re-use them for ensemble + selection. + num_run (Optional[int]): + An identifier of the current configuration being fit. This number is unique per + configuration. + include (Optional[Dict[str, Any]]): + An optional dictionary to include components of the pipeline steps. + exclude (Optional[Dict[str, Any]]): + An optional dictionary to exclude components of the pipeline steps. + disable_file_output (Union[bool, List[str]]): + By default, the model, it's predictions and other metadata is stored on disk + for each finished configuration. This argument allows the user to skip + saving certain file type, for example the model, from being written to disk. + init_params (Optional[Dict[str, Any]]): + Optional argument that is passed to each pipeline step. It is the equivalent of + kwargs for the pipeline steps. + logger_port (Optional[int]): + Logging is performed using a socket-server scheme to be robust against many + parallel entities that want to write to the same file. This integer states the + socket port for the communication channel. If None is provided, a traditional + logger is used. + instance (str): + An instance on which to evaluate the current pipeline. By default we work + with a single instance, being the provided X_train, y_train of a single dataset. + This instance is a compatibility argument for SMAC, that is capable of working + with multiple datasets at the same time. 
+ """ + evaluator = StackingEvaluator( + backend=backend, + queue=queue, + metric=metric, + configuration=config, + seed=seed, + num_run=num_run, + output_y_hat_optimization=output_y_hat_optimization, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates + ) + evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index d790237b7..b6242e379 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -24,7 +24,7 @@ HoldoutValTypes, NoResamplingStrategyTypes ) -from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager +from autoPyTorch.ensemble.ensemble_builder_manager import EnsembleBuilderManager from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash from autoPyTorch.optimizer.utils import read_return_initial_configurations from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric diff --git a/test/test_ensemble/ensemble_utils.py b/test/test_ensemble/ensemble_utils.py index 7b0ab7fb8..addfdd762 100644 --- a/test/test_ensemble/ensemble_utils.py +++ b/test/test_ensemble/ensemble_utils.py @@ -4,7 +4,7 @@ import numpy as np -from autoPyTorch.ensemble.ensemble_builder import ( +from autoPyTorch.ensemble.ensemble_builder_manager import ( AbstractEnsemble, EnsembleBuilder, ) diff --git a/test/test_ensemble/test_ensemble.py b/test/test_ensemble/test_ensemble.py index 402659e08..d8463ab86 100644 --- a/test/test_ensemble/test_ensemble.py +++ b/test/test_ensemble/test_ensemble.py @@ -18,7 +18,7 @@ from smac.runhistory.runhistory import RunHistory, RunKey, RunValue from autoPyTorch.constants import BINARY, MULTICLASS, TABULAR_CLASSIFICATION -from autoPyTorch.ensemble.ensemble_builder import ( +from autoPyTorch.ensemble.ensemble_builder_manager import ( EnsembleBuilder, EnsembleBuilderManager, Y_ENSEMBLE, From 16b10f0f2c8ec5cfb3b05687e03ead65e117ebd3 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 14:30:36 +0200 Subject: [PATCH 02/16] WIP: done changes in stackingensemblebuilder, todo: stackingensemble --- autoPyTorch/api/base_task.py | 5 +- autoPyTorch/ensemble/ensemble_builder.py | 167 +------------- autoPyTorch/ensemble/stacking_ensemble.py | 140 +++-------- .../ensemble/stacking_ensemble_builder.py | 218 ++++++++++++++---- autoPyTorch/evaluation/stacking_evaluator.py | 131 ++++++++++- .../components/training/metrics/metrics.py | 8 +- test/test_ensemble/test_ensemble.py | 4 +- 7 files changed, 353 insertions(+), 320 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index c7c99d5e1..514af72d2 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1872,6 +1872,9 @@ def _init_ensemble_builder( # builder in the provide dask client required_dataset_properties = {'task_type': self.task_type, 'output_type': self.dataset.output_type} + metrics = get_metrics( + dataset_properties=required_dataset_properties, names=[optimize_metric]) + self._logger.info(f"metrics are {metrics}") proc_ensemble = EnsembleBuilderManager( start_time=time.time(), time_left_for_ensembles=time_left_for_ensembles, @@ -1879,7 +1882,7 @@ def _init_ensemble_builder( dataset_name=str(self.dataset.dataset_name), output_type=STRING_TO_OUTPUT_TYPES[self.dataset.output_type], 
task_type=STRING_TO_TASK_TYPES[self.task_type], - metrics=[self._metric] if self._metric is not None else get_metrics( + metrics=get_metrics( dataset_properties=required_dataset_properties, names=[optimize_metric]), opt_metric=optimize_metric, ensemble_size=ensemble_size, diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py index 311069e50..87ec9a2b9 100644 --- a/autoPyTorch/ensemble/ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_builder.py @@ -38,6 +38,7 @@ MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' +# TODO: store class EnsembleBuilder(object): def __init__( self, @@ -200,6 +201,8 @@ def __init__( # from dask, it builds this object from scratch) # we save the state of this dictionary to memory # and read it if available + # TODO: ensemble_read_preds comes from self.predict, + # see line #513 self.ensemble_memory_file = os.path.join( self.backend.internals_directory, 'ensemble_read_preds.pkl' @@ -646,6 +649,7 @@ def compute_loss_per_model(self) -> bool: os.path.getmtime(y_ens_fn), ) + self.logger.debug(f"keys in losses {losses.keys()}") self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] # It is not needed to create the object here @@ -674,167 +678,7 @@ def compute_loss_per_model(self) -> bool: ) return True - def get_n_best_preds(self) -> List[str]: - """ - get best n predictions (i.e., keys of self.read_losses) - according to the loss on the "ensemble set" - n: self.ensemble_nbest - - Side effects: - ->Define the n-best models to use in ensemble - ->Only the best models are loaded - ->Any model that is not best is candidate to deletion - if max models in disc is exceeded. - """ - - sorted_keys = self._get_list_of_sorted_preds() - - # number of models available - num_keys = len(sorted_keys) - # remove all that are at most as good as random - # note: dummy model must have run_id=1 (there is no run_id=0) - dummy_losses = list(filter(lambda x: x[2] == 1, sorted_keys)) - # Leave this here for when we enable dummy classifier/scorer - if len(dummy_losses) > 0: - # number of dummy models - num_dummy = len(dummy_losses) - dummy_loss = dummy_losses[0] - self.logger.debug("Use %f as dummy loss" % dummy_loss[1]) - sorted_keys = list(filter(lambda x: x[1] < dummy_loss[1], sorted_keys)) - - # remove Dummy Classifier - sorted_keys = list(filter(lambda x: x[2] > 1, sorted_keys)) - if len(sorted_keys) == 0: - # no model left; try to use dummy loss (num_run==0) - # log warning when there are other models but not better than dummy model - if num_keys > num_dummy: - self.logger.warning("No models better than random - using Dummy Score!" - "Number of models besides current dummy model: %d. " - "Number of dummy models: %d", - num_keys - 1, - num_dummy) - sorted_keys = [ - (k, v["ens_loss"], v["num_run"]) for k, v in self.read_losses.items() - if v["seed"] == self.seed and v["num_run"] == 1 - ] - # reload predictions if losses changed over time and a model is - # considered to be in the top models again! - if not isinstance(self.ensemble_nbest, numbers.Integral): - # Transform to number of models to keep. 
Keep at least one - keep_nbest = max(1, min(len(sorted_keys), - int(len(sorted_keys) * self.ensemble_nbest))) - self.logger.debug( - "Library pruning: using only top %f percent of the models for ensemble " - "(%d out of %d)", - self.ensemble_nbest * 100, keep_nbest, len(sorted_keys) - ) - else: - # Keep only at most ensemble_nbest - keep_nbest = min(self.ensemble_nbest, len(sorted_keys)) - self.logger.debug("Library Pruning: using for ensemble only " - " %d (out of %d) models" % (keep_nbest, len(sorted_keys))) - - # If max_models_on_disc is None, do nothing - # One can only read at most max_models_on_disc models - if self.max_models_on_disc is not None: - if not isinstance(self.max_models_on_disc, numbers.Integral): - consumption = [ - [ - v["ens_loss"], - v["disc_space_cost_mb"], - ] for v in self.read_losses.values() if v["disc_space_cost_mb"] is not None - ] - max_consumption = max(c[1] for c in consumption) - - # We are pessimistic with the consumption limit indicated by - # max_models_on_disc by 1 model. Such model is assumed to spend - # max_consumption megabytes - if (sum(c[1] for c in consumption) + max_consumption) > self.max_models_on_disc: - - # just leave the best -- smaller is better! - # This list is in descending order, to preserve the best models - sorted_cum_consumption = np.cumsum([ - c[1] for c in list(sorted(consumption)) - ]) + max_consumption - max_models = np.argmax(sorted_cum_consumption > self.max_models_on_disc) - - # Make sure that at least 1 model survives - self.max_resident_models = max(1, max_models) - self.logger.warning( - "Limiting num of models via float max_models_on_disc={}" - " as accumulated={} worst={} num_models={}".format( - self.max_models_on_disc, - (sum(c[1] for c in consumption) + max_consumption), - max_consumption, - self.max_resident_models - ) - ) - else: - self.max_resident_models = None - else: - self.max_resident_models = self.max_models_on_disc - - if self.max_resident_models is not None and keep_nbest > self.max_resident_models: - self.logger.debug( - "Restricting the number of models to %d instead of %d due to argument " - "max_models_on_disc", - self.max_resident_models, keep_nbest, - ) - keep_nbest = self.max_resident_models - - # consider performance_range_threshold - if self.performance_range_threshold > 0: - best_loss = sorted_keys[0][1] - worst_loss = dummy_loss[1] - worst_loss -= (worst_loss - best_loss) * self.performance_range_threshold - if sorted_keys[keep_nbest - 1][1] > worst_loss: - # We can further reduce number of models - # since worst model is worse than thresh - for i in range(0, keep_nbest): - # Look at most at keep_nbest models, - # but always keep at least one model - current_loss = sorted_keys[i][1] - if current_loss >= worst_loss: - self.logger.debug("Dynamic Performance range: " - "Further reduce from %d to %d models", - keep_nbest, max(1, i)) - keep_nbest = max(1, i) - break - ensemble_n_best = keep_nbest - - # reduce to keys - reduced_sorted_keys = list(map(lambda x: x[0], sorted_keys)) - - # remove loaded predictions for non-winning models - for k in reduced_sorted_keys[ensemble_n_best:]: - if k in self.read_preds: - self.read_preds[k][Y_ENSEMBLE] = None - self.read_preds[k][Y_TEST] = None - if self.read_losses[k]['loaded'] == 1: - self.logger.debug( - 'Dropping model %s (%d,%d) with loss %f.', - k, - self.read_losses[k]['seed'], - self.read_losses[k]['num_run'], - self.read_losses[k]['ens_loss'], - ) - self.read_losses[k]['loaded'] = 2 - - # Load the predictions for the winning - for k in 
reduced_sorted_keys[:ensemble_n_best]: - if ( - ( - k not in self.read_preds or self.read_preds[k][Y_ENSEMBLE] is None - ) - and self.read_losses[k]['loaded'] != 3 - ): - self.read_preds[k][Y_ENSEMBLE] = self._read_np_fn(k) - # No need to load test here because they are loaded - # only if the model ends up in the ensemble - self.read_losses[k]['loaded'] = 1 - - # return best scored keys of self.read_losses - return reduced_sorted_keys[:ensemble_n_best] + def get_test_preds(self, selected_keys: List[str]) -> List[str]: """ @@ -1104,6 +948,7 @@ def _get_list_of_sorted_preds(self) -> List[Tuple[str, float, int]]: # We want small num_run first key=lambda x: (x[1], x[2]), )) + self.logger.debug(f"Selected keys: {sorted_keys}") return sorted_keys def _delete_excess_models(self, selected_keys: List[str]) -> None: diff --git a/autoPyTorch/ensemble/stacking_ensemble.py b/autoPyTorch/ensemble/stacking_ensemble.py index 425d2d8ba..4d6987eb1 100644 --- a/autoPyTorch/ensemble/stacking_ensemble.py +++ b/autoPyTorch/ensemble/stacking_ensemble.py @@ -2,6 +2,7 @@ from typing import Any, Dict, List, Tuple, Union import numpy as np +from sklearn.base import BaseEstimator from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble from autoPyTorch.pipeline.base_pipeline import BasePipeline @@ -21,6 +22,11 @@ def __init__( metric: autoPyTorchMetric, task_type: int, random_state: np.random.RandomState, + # should be with something like numrun_seed_budget. + ensemble_identifiers = None, + best_model_identifier = None, + ensemble_slot_j: int = None, + read_preds = None, ) -> None: self.ensemble_size = ensemble_size self.metric = metric @@ -40,9 +46,11 @@ def __getstate__(self) -> Dict[str, Any]: def fit( self, - predictions: List[np.ndarray], + predictions_ensemble: List[np.ndarray], + best_model_predictions: np.ndarray, labels: np.ndarray, - identifiers: List[Tuple[int, int, float]], + ensemble_identifiers: List[Tuple[int, int, float]], + best_model_identifier: Tuple[int, int, float] ) -> AbstractEnsemble: """ Builds a ensemble given the individual models out of fold predictions. @@ -62,13 +70,7 @@ def fit( Returns: A copy of self """ - self.ensemble_size = int(self.ensemble_size) - if self.ensemble_size < 1: - raise ValueError('Ensemble size cannot be less than one!') - self._fit(predictions, labels) - self._calculate_weights() - self.identifiers_ = identifiers return self # TODO: fit a stacked ensemble. @@ -78,10 +80,10 @@ def _fit( labels: np.ndarray, ) -> None: """ - Fast version of Rich Caruana's ensemble selection method. + Implemenation of Lévesque et al. For more details, please check the paper - "Ensemble Selection from Library of Models" by R Caruana (2004) + "Bayesian hyperparameter optimization for ensemble learning" by Lévesque (2004) Args: predictions (List[np.ndarray]): @@ -103,57 +105,21 @@ def _fit( predictions[0].shape, dtype=np.float64, ) - fant_ensemble_prediction = np.zeros( - weighted_ensemble_prediction.shape, - dtype=np.float64, - ) - for i in range(ensemble_size): - losses = np.zeros( - (len(predictions)), - dtype=np.float64, - ) - s = len(ensemble) - if s > 0: - np.add( - weighted_ensemble_prediction, - ensemble[-1], - out=weighted_ensemble_prediction, - ) - - # Memory-efficient averaging! 
- for j, pred in enumerate(predictions): - # fant_ensemble_prediction is the prediction of the current ensemble - # and should be ([predictions[selected_prev_iterations] + predictions[j])/(s+1) - # We overwrite the contents of fant_ensemble_prediction - # directly with weighted_ensemble_prediction + new_prediction and then scale for avg - np.add( - weighted_ensemble_prediction, - pred, - out=fant_ensemble_prediction - ) - np.multiply( - fant_ensemble_prediction, - (1. / float(s + 1)), - out=fant_ensemble_prediction - ) - - # Calculate loss is versatile and can return a dict of slosses - losses[j] = calculate_loss( - metrics=[self.metric], - target=labels, - prediction=fant_ensemble_prediction, - task_type=self.task_type, - )[self.metric.name] - - all_best = np.argwhere(losses == np.nanmin(losses)).flatten() - best = self.random_state.choice(all_best) - ensemble.append(predictions[best]) - trajectory.append(losses[best]) - order.append(best) - - # Handle special case - if len(predictions) == 1: - break + + # Calculate loss is versatile and can return a dict of slosses + # losses[j] = calculate_loss( + # metrics=[self.metric], + # target=labels, + # prediction=fant_ensemble_prediction, + # task_type=self.task_type, + # )[self.metric.name] + + # all_best = np.argwhere(losses == np.nanmin(losses)).flatten() + # best = self.random_state.choice(all_best) + # ensemble.append(predictions[best]) + # trajectory.append(losses[best]) + # order.append(best) + self.indices_: List[int] = order self.trajectory_: List[float] = trajectory @@ -174,12 +140,9 @@ def _calculate_weights(self) -> None: dtype=np.float64, ) for ensemble_member in ensemble_members: - weight = float(ensemble_member[1]) / self.ensemble_size + weight = 1 weights[ensemble_member[0]] = weight - if np.sum(weights) < 1: - weights = weights / np.sum(weights) - self.weights_ = weights # TODO: Adjust this to use weights and make @@ -201,16 +164,9 @@ def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarra average = np.zeros_like(predictions[0], dtype=np.float64) tmp_predictions = np.empty_like(predictions[0], dtype=np.float64) - # if predictions.shape[0] == len(self.weights_), - # predictions include those of zero-weight models. - if len(predictions) == len(self.weights_): - for pred, weight in zip(predictions, self.weights_): - np.multiply(pred, weight, out=tmp_predictions) - np.add(average, tmp_predictions, out=average) - # if prediction model.shape[0] == len(non_null_weights), # predictions do not include those of zero-weight models. - elif len(predictions) == np.count_nonzero(self.weights_): + if len(predictions) == np.count_nonzero(self.weights_): non_null_weights = [w for w in self.weights_ if w > 0] for pred, weight in zip(predictions, non_null_weights): np.multiply(pred, weight, out=tmp_predictions) @@ -233,35 +189,6 @@ def __str__(self) -> str: enumerate(self.identifiers_) if self.weights_[idx] > 0])) - - def get_models_with_weights( - self, - models: Dict[Any, BasePipeline] - ) -> List[Tuple[float, BasePipeline]]: - """ - Handy function to tag the provided input models with a given weight. - - Args: - models (List[Tuple[float, BasePipeline]]): - A dictionary that maps a model's name to it's actual python object. - - Returns: - output (List[Tuple[float, BasePipeline]]): - each model with the related weight, sorted by ascending - performance. Notice that ensemble selection solves a minimization - problem. 
- """ - output = [] - for i, weight in enumerate(self.weights_): - if weight > 0.0: - identifier = self.identifiers_[i] - model = models[identifier] - output.append((weight, model)) - - output.sort(reverse=True, key=lambda t: t[0]) - - return output - def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: """ After training of ensemble selection, not all models will be used. @@ -290,3 +217,12 @@ def get_validation_performance(self) -> float: best ensemble training performance """ return self.trajectory_[-1] + + def predict_with_current_pipeline( + self, + pipeline_predictions: np.ndarray, + ) -> None: + # TODO: predict with ensemble by replacing model at j = self.iteration mod m, + # where m is ensemble_size. + # returns None + pass diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index 39a96bbf6..e3f54a818 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -2,20 +2,23 @@ import logging import logging.handlers import math +from mmap import MADV_NOHUGEPAGE import os import pickle +import re import time import traceback import zlib from typing import Dict, List, Optional, Tuple, Union import numpy as np +from numpy.random.mtrand import seed from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import BINARY from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder -from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection +from autoPyTorch.ensemble.stacking_ensemble import StackingEnsemble from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss from autoPyTorch.utils.logging_ import get_named_client_logger @@ -180,8 +183,17 @@ def main( time_left - used_time, ) - # populates self.read_preds and self.read_losses - if not self.compute_loss_per_model(): + ensemble_identifiers = None + # Get ensemble_identifiers from previous iteration. + ensemble_dir = self.backend.get_ensemble_dir() + if len(os.listdir(ensemble_dir)) >= 1: + old_ensemble = self.backend.load_ensemble(self.seed) + ensemble_identifiers = old_ensemble.ensemble_identifiers + + self.ensemble_slot_j = np.mod(iteration, self.ensemble_size) + + # populates self.read_preds and self.read_losses with individual model predictions and ensemble loss. 
+ if not self.compute_ensemble_loss_per_model(ensemble_identifiers=ensemble_identifiers): if return_predictions: return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred else: @@ -196,7 +208,7 @@ def main( else: return self.ensemble_history, self.ensemble_nbest, None, None - # populates predictions in self.read_preds + # populates test predictions in self.read_preds # reduces selected models if file reading failed n_sel_test = self.get_test_preds(selected_keys=candidate_models) @@ -215,8 +227,19 @@ def main( for candidate in candidate_models: self._has_been_candidate.add(candidate) + # as candidate models is sorted in `get_n_best_preds` + best_model_identifier = candidate_models[0] + + # initialise ensemble_identifier with best_model_identifier + if ensemble_identifiers == None: + ensemble_identifiers = [best_model_identifier] + # train ensemble - ensemble = self.fit_ensemble(selected_keys=candidate_models) + ensemble = self.fit_ensemble( + selected_keys=candidate_models, + ensemble_identifiers=ensemble_identifiers, + best_model_identifier=best_model_identifier + ) # Save the ensemble for later use in the main module! if ensemble is not None and self.SAVE2DISC: @@ -261,31 +284,11 @@ def main( else: return self.ensemble_history, self.ensemble_nbest, None, None - def get_disk_consumption(self, pred_path: str) -> float: - """ - gets the cost of a model being on disc - """ - - match = self.model_fn_re.search(pred_path) - if not match: - raise ValueError("Invalid path format %s" % pred_path) - _seed = int(match.group(1)) - _num_run = int(match.group(2)) - _budget = float(match.group(3)) - - stored_files_for_run = os.listdir( - self.backend.get_numrun_directory(_seed, _num_run, _budget)) - stored_files_for_run = [ - os.path.join(self.backend.get_numrun_directory(_seed, _num_run, _budget), file_name) - for file_name in stored_files_for_run] - this_model_cost = sum([os.path.getsize(path) for path in stored_files_for_run]) - - # get the megabytes - return round(this_model_cost / math.pow(1024, 2), 2) - # TODO: change this function, to compute loss according to Lavesque et al. # TODO: this will help us in choosing the model with the lowest ensemble error. 
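# Illustrative sketch (hypothetical function, not part of this patch): the TODO above asks
# for a per-model loss that already accounts for the rest of the ensemble. Assuming the
# current members' out-of-fold predictions are available, the candidate is substituted into
# the cycled slot and the averaged prediction is scored; a plain MSE stands in here for
# autoPyTorch's calculate_loss with the configured metric.
import numpy as np
from typing import List, Optional

def ensemble_loss_with_candidate(
    member_preds: List[Optional[np.ndarray]],
    candidate_pred: np.ndarray,
    slot_j: int,
    y_true: np.ndarray,
) -> float:
    preds = list(member_preds)
    preds[slot_j] = candidate_pred                # candidate takes over slot j
    filled = [p for p in preds if p is not None]
    avg = np.mean(filled, axis=0)                 # soft-vote average of the stacked members
    return float(np.mean((avg - y_true) ** 2))    # placeholder loss; smaller is better

# The candidate whose substitution yields the smallest such loss would become the
# best_model_identifier for this iteration.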
- def compute_loss_per_model(self) -> bool: + # TODO: predictions on ensemble set will be available in read_preds to be used for + # TODO: passing to stacking_ensemble_builder.predict() + def compute_ensemble_loss_per_model(self, ensemble_identifiers) -> bool: """ Compute the loss of the predictions on ensemble building data set; populates self.read_preds and self.read_losses @@ -374,12 +377,10 @@ def compute_loss_per_model(self) -> bool: # actually read the predictions and compute their respective loss try: y_ensemble = self._read_np_fn(y_ens_fn) - losses = calculate_loss( - metrics=self.metrics, - target=self.y_true_ensemble, - prediction=y_ensemble, - task_type=self.task_type, - ) + losses = self.get_ensemble_loss( + ensemble_identifiers=ensemble_identifiers, + model_predictions=y_ensemble + ) if np.isfinite(self.read_losses[y_ens_fn]["ens_loss"]): self.logger.debug( @@ -478,7 +479,12 @@ def get_test_preds(self, selected_keys: List[str]) -> List[str]: return success_keys_test - def fit_ensemble(self, selected_keys: List[str]) -> Optional[EnsembleSelection]: + def fit_ensemble( + self, + selected_keys: List[str], + best_model_identifier, + ensemble_identifiers = None + ) -> Optional[StackingEnsemble]: """ fit ensemble @@ -489,21 +495,29 @@ def fit_ensemble(self, selected_keys: List[str]) -> Optional[EnsembleSelection]: Returns ------- - ensemble: EnsembleSelection + ensemble: StackingEnsemble trained Ensemble """ if self.unit_test: raise MemoryError() - predictions_train = [self.read_preds[k][Y_ENSEMBLE] for k in selected_keys] - include_num_runs = [ + predictions_train = [self.read_preds[k][Y_ENSEMBLE] for k in ensemble_identifiers] + best_model_predictions = self.read_preds[best_model_identifier][Y_ENSEMBLE] + + ensemble_num_runs = [ ( self.read_losses[k]["seed"], self.read_losses[k]["num_run"], self.read_losses[k]["budget"], ) - for k in selected_keys] + for k in ensemble_identifiers] + + best_model_num_run = ( + self.read_losses[best_model_identifier]["seed"], + self.read_losses[best_model_identifier]["num_run"], + self.read_losses[best_model_identifier]["budget"], + ) # check hash if ensemble training data changed current_hash = "".join([ @@ -525,7 +539,7 @@ def fit_ensemble(self, selected_keys: List[str]) -> Optional[EnsembleSelection]: raise ValueError(f"Cannot optimize for {self.opt_metric} in {self.metrics} " "as more than one unique optimization metric was found.") - ensemble = EnsembleSelection( + ensemble = StackingEnsemble( ensemble_size=self.ensemble_size, metric=opt_metric, random_state=self.random_state, @@ -538,8 +552,12 @@ def fit_ensemble(self, selected_keys: List[str]) -> Optional[EnsembleSelection]: len(predictions_train), ) start_time = time.time() - ensemble.fit(predictions_train, self.y_true_ensemble, - include_num_runs) + ensemble.fit( + predictions_train, + best_model_predictions, + self.y_true_ensemble, + ensemble_num_runs, + best_model_num_run) end_time = time.time() self.logger.debug( "Fitting the ensemble took %.2f seconds.", @@ -616,4 +634,118 @@ def predict(self, set_: str, n_preds, set_, ) - return None \ No newline at end of file + return None + + def get_n_best_preds(self) -> List[str]: + """ + get best n predictions (i.e., keys of self.read_losses) + according to the loss on the "ensemble set" + n: all models + + Side effects: + ->Define the n-best models to use in ensemble + ->Only the best models are loaded + ->Any model that is not best is candidate to deletion + if max models in disc is exceeded. 
+ """ + + sorted_keys = self._get_list_of_sorted_preds() + + # number of models available + num_keys = len(sorted_keys) + # remove all that are at most as good as random + # note: dummy model must have run_id=1 (there is no run_id=0) + dummy_losses = list(filter(lambda x: x[2] == 1, sorted_keys)) + # Leave this here for when we enable dummy classifier/scorer + if len(dummy_losses) > 0: + # number of dummy models + num_dummy = len(dummy_losses) + dummy_loss = dummy_losses[0] + self.logger.debug("Use %f as dummy loss" % dummy_loss[1]) + sorted_keys = list(filter(lambda x: x[1] < dummy_loss[1], sorted_keys)) + + # remove Dummy Classifier + sorted_keys = list(filter(lambda x: x[2] > 1, sorted_keys)) + if len(sorted_keys) == 0: + # no model left; try to use dummy loss (num_run==0) + # log warning when there are other models but not better than dummy model + if num_keys > num_dummy: + self.logger.warning("No models better than random - using Dummy Score!" + "Number of models besides current dummy model: %d. " + "Number of dummy models: %d", + num_keys - 1, + num_dummy) + sorted_keys = [ + (k, v["ens_loss"], v["num_run"]) for k, v in self.read_losses.items() + if v["seed"] == self.seed and v["num_run"] == 1 + ] + + # reduce to keys + reduced_sorted_keys = list(map(lambda x: x[0], sorted_keys)) + + # Load the predictions for the winning + for k in reduced_sorted_keys: + if ( + ( + k not in self.read_preds or self.read_preds[k][Y_ENSEMBLE] is None + ) + and self.read_losses[k]['loaded'] != 3 + ): + self.read_preds[k][Y_ENSEMBLE] = self._read_np_fn(k) + # No need to load test here because they are loaded + # only if the model ends up in the ensemble + self.read_losses[k]['loaded'] = 1 + # return best scored keys of self.read_losses + return reduced_sorted_keys + + def get_ensemble_loss(self, ensemble_identifiers: List[str], model_predictions: np.ndarray): + """ + Gets the loss of the ensemble given slot j and predictions for new model at slot j + set is ensemble + Args: + ensemble_identifiers ([type]): [description] + model_predictions ([type]): [description] + """ + + + if ensemble_identifiers is None: + loss = calculate_loss( + metrics=[self.metric], + target=self.y_true_ensemble, + prediction=model_predictions, + task_type=self.task_type, + ) + else: + weighted_ensemble_prediction = np.zeros( + model_predictions.shape, + dtype=np.float64, + ) + fant_ensemble_prediction = np.zeros( + weighted_ensemble_prediction.shape, + dtype=np.float64, + ) + + + for i, identifier in enumerate(ensemble_identifiers): + if self.read_preds[identifier][Y_ENSEMBLE] == None: + # y ensemble read_preds is loaded in get_n_best_preds. If there is no value for this that means its a new model at this iteration. + raise ValueError("check here to resolve starting condition") + predictions = self.read_preds[identifier][Y_ENSEMBLE] if i != self.ensemble_slot_j else model_predictions + + np.add( + weighted_ensemble_prediction, + predictions, + out=fant_ensemble_prediction + ) + np.multiply( + fant_ensemble_prediction, + (1. 
/ float(self.ensemble_size)), + out=fant_ensemble_prediction + ) + loss = calculate_loss( + metrics=[self.metric], + target=self.y_true_ensemble, + prediction=fant_ensemble_prediction, + task_type=self.task_type, + ) + return loss \ No newline at end of file diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index bf842ecb9..3e1da6e5a 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -1,4 +1,6 @@ from multiprocessing.queues import Queue +import os +import time from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration @@ -19,6 +21,7 @@ AbstractEvaluator, fit_and_suppress_warnings ) +from autoPyTorch.ensemble.stacking_ensemble import StackingEnsemble from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.common import dict_repr, subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -138,9 +141,6 @@ def __init__(self, backend: Backend, queue: Queue, # TODO: Therefore we will have to store pipelines using datamanager and load them, see if we only need predictions. # TODO: but we will need the whole pipeline as we would like to predict with different dataset, like val or something - self.splits = self.datamanager.splits - if self.splits is None: - raise AttributeError("Must have called create_splits on {}".format(self.datamanager.__class__.__name__)) self.num_folds: int = len(self.splits) self.Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds self.Y_train_targets: np.ndarray = np.ones(self.y_train.shape) * np.NaN @@ -150,6 +150,114 @@ def __init__(self, backend: Backend, queue: Queue, self.logger.debug("Search space updates :{}".format(self.search_space_updates)) self.keep_models = keep_models + def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], + valid_pred: Optional[np.ndarray], + test_pred: Optional[np.ndarray], additional_run_info: Optional[Dict], + file_output: bool, status: StatusType, + ensemble_pred: Optional[np.ndarray], + ) -> Optional[Tuple[float, float, int, Dict]]: + """This function does everything necessary after the fitting is done: + * predicting + * saving the necessary files + We use it as the signal handler so we can recycle the code for the + normal usecase and when the runsolver kills us here :)""" + + self.duration = time.time() - self.starttime + + if file_output: + loss_, additional_run_info_ = self.file_output( + None, valid_pred, test_pred, + ) + else: + loss_ = None + additional_run_info_ = {} + + validation_loss, test_loss = self.calculate_auxiliary_losses( + valid_pred, test_pred + ) + + if loss_ is not None: + return self.duration, loss_, self.seed, additional_run_info_ + + cost = loss[self.metric.name] + + additional_run_info = ( + {} if additional_run_info is None else additional_run_info + ) + for metric_name, value in loss.items(): + additional_run_info[metric_name] = value + additional_run_info['duration'] = self.duration + additional_run_info['num_run'] = self.num_run + if train_loss is not None: + additional_run_info['train_loss'] = train_loss + if validation_loss is not None: + additional_run_info['validation_loss'] = validation_loss + if test_loss is not None: + additional_run_info['test_loss'] = test_loss + + rval_dict = {'loss': cost, + 'additional_run_info': additional_run_info, + 'status': status} + + self.queue.put(rval_dict) + return 
None + + def file_output( + self, + Y_optimization_pred: np.ndarray, + Y_valid_pred: np.ndarray, + Y_test_pred: np.ndarray, + ) -> Tuple[Optional[float], Dict]: + + # Abort if predictions contain NaNs + for y, s in [ + [Y_valid_pred, 'validation'], + [Y_test_pred, 'test'] + ]: + if y is not None and not np.all(np.isfinite(y)): + return ( + 1.0, + { + 'error': + 'Model predictions for %s set contains NaNs.' % s + }, + ) + + # Abort if we don't want to output anything. + if hasattr(self, 'disable_file_output'): + if self.disable_file_output: + return None, {} + else: + self.disabled_file_outputs = [] + + if hasattr(self, 'pipeline') and self.pipeline is not None: + if 'pipeline' not in self.disabled_file_outputs: + pipeline = self.pipeline + else: + pipeline = None + else: + pipeline = None + + self.logger.debug("Saving model {}_{}_{} to disk".format(self.seed, self.num_run, self.budget)) + self.backend.save_numrun_to_dir( + seed=int(self.seed), + idx=int(self.num_run), + budget=float(self.budget), + model=pipeline, + cv_model=None, + ensemble_predictions=None, + valid_predictions=( + Y_valid_pred if 'y_valid' not in + self.disabled_file_outputs else None + ), + test_predictions=( + Y_test_pred if 'y_test' not in + self.disabled_file_outputs else None + ), + ) + + return None, {} + def fit_predict_and_loss(self) -> None: """Fit, predict and compute the loss for cross-validation and holdout""" @@ -232,18 +340,23 @@ def _predict(self, pipeline: BaseEstimator, train_indices: Union[np.ndarray, List] ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: # TODO: load ensemble members and predict using the whole ensemble. - + # TODO: we need some function to pass this pipeline to the last stored ensemble replace + # TODO: model j, where j = ensemble.iteration mod m. then we need to predict + # TODO: Also, we will pass the predictions from this pipeline as that is what is needed + # TODO: to create the ensemble. 
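# Illustrative sketch (hypothetical helper, not part of this patch): the TODOs above
# describe the intended flow -- the freshly fitted pipeline's out-of-fold predictions are
# plugged into the last persisted StackingEnsemble so that the evaluator can report the
# ensemble loss instead of the single-model loss. Note that the code below calls
# old_ensemble.predict_with_current_model(), while the method added to StackingEnsemble
# in this patch series is predict_with_current_pipeline(pipeline_predictions), which
# expects the new pipeline's predictions as an argument.
import os
import numpy as np

def ensemble_opt_predictions(backend, seed: int, pipeline_opt_pred: np.ndarray):
    # Returns the stacked-ensemble predictions with the current pipeline in its slot,
    # or None when no ensemble has been persisted yet (first iteration).
    ensemble_dir = backend.get_ensemble_dir()
    if not os.path.isdir(ensemble_dir) or len(os.listdir(ensemble_dir)) == 0:
        return None
    old_ensemble = backend.load_ensemble(seed)
    return old_ensemble.predict_with_current_pipeline(pipeline_opt_pred)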
train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, self.y_train[train_indices]) - opt_pred = self.predict_function(subsampler(self.X_train, test_indices), pipeline, + pipeline_opt_pred = self.predict_function(subsampler(self.X_train, test_indices), pipeline, self.y_train[train_indices]) - if self.X_valid is not None: - valid_pred = self.predict_function(self.X_valid, pipeline, - self.y_valid) + ensemble_dir = self.backend.get_ensemble_dir() + if len(os.listdir(ensemble_dir)) >= 1: + old_ensemble = self.backend.load_ensemble(self.seed) + assert isinstance(old_ensemble, StackingEnsemble) + ensemble_opt_pred = old_ensemble.predict_with_current_model() else: - valid_pred = None + ensemble_opt_pred = None if self.X_test is not None: test_pred = self.predict_function(self.X_test, pipeline, diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index 0d82b9622..2d32dece0 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -46,7 +46,11 @@ sklearn.metrics.balanced_accuracy_score) f1 = make_metric('f1', sklearn.metrics.f1_score) - +zero_one_loss = make_metric('zero_one_loss', + sklearn.metrics.zero_one_loss, + optimum=0, + greater_is_better=False, + worst_possible_result=MAXINT) # Score functions that need decision values roc_auc = make_metric('roc_auc', sklearn.metrics.roc_auc_score, needs_threshold=True) average_precision = make_metric('average_precision', @@ -73,7 +77,7 @@ CLASSIFICATION_METRICS = dict() for scorer in [accuracy, balanced_accuracy, roc_auc, average_precision, - log_loss]: + log_loss, zero_one_loss]: CLASSIFICATION_METRICS[scorer.name] = scorer for name, metric in [('precision', sklearn.metrics.precision_score), diff --git a/test/test_ensemble/test_ensemble.py b/test/test_ensemble/test_ensemble.py index d8463ab86..6a2650313 100644 --- a/test/test_ensemble/test_ensemble.py +++ b/test/test_ensemble/test_ensemble.py @@ -18,9 +18,9 @@ from smac.runhistory.runhistory import RunHistory, RunKey, RunValue from autoPyTorch.constants import BINARY, MULTICLASS, TABULAR_CLASSIFICATION -from autoPyTorch.ensemble.ensemble_builder_manager import ( +from autoPyTorch.ensemble.ensemble_builder_manager import EnsembleBuilderManager +from autoPyTorch.ensemble.ensemble_builder import ( EnsembleBuilder, - EnsembleBuilderManager, Y_ENSEMBLE, Y_TEST, ) From 7ed9693051c8f499dba25c65c66bc16f150d5b61 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 14:34:13 +0200 Subject: [PATCH 03/16] revert deletion of get_n_best_preds (clean) --- autoPyTorch/ensemble/ensemble_builder.py | 161 ++++++++++++++++++++++- 1 file changed, 160 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py index 87ec9a2b9..ea2b77c97 100644 --- a/autoPyTorch/ensemble/ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_builder.py @@ -678,7 +678,166 @@ def compute_loss_per_model(self) -> bool: ) return True - + def get_n_best_preds(self) -> List[str]: + """ + get best n predictions (i.e., keys of self.read_losses) + according to the loss on the "ensemble set" + n: self.ensemble_nbest + Side effects: + ->Define the n-best models to use in ensemble + ->Only the best models are loaded + ->Any model that is not best is candidate to deletion + if max models in disc is exceeded. 
+ """ + + sorted_keys = self._get_list_of_sorted_preds() + + # number of models available + num_keys = len(sorted_keys) + # remove all that are at most as good as random + # note: dummy model must have run_id=1 (there is no run_id=0) + dummy_losses = list(filter(lambda x: x[2] == 1, sorted_keys)) + # Leave this here for when we enable dummy classifier/scorer + if len(dummy_losses) > 0: + # number of dummy models + num_dummy = len(dummy_losses) + dummy_loss = dummy_losses[0] + self.logger.debug("Use %f as dummy loss" % dummy_loss[1]) + sorted_keys = list(filter(lambda x: x[1] < dummy_loss[1], sorted_keys)) + + # remove Dummy Classifier + sorted_keys = list(filter(lambda x: x[2] > 1, sorted_keys)) + if len(sorted_keys) == 0: + # no model left; try to use dummy loss (num_run==0) + # log warning when there are other models but not better than dummy model + if num_keys > num_dummy: + self.logger.warning("No models better than random - using Dummy Score!" + "Number of models besides current dummy model: %d. " + "Number of dummy models: %d", + num_keys - 1, + num_dummy) + sorted_keys = [ + (k, v["ens_loss"], v["num_run"]) for k, v in self.read_losses.items() + if v["seed"] == self.seed and v["num_run"] == 1 + ] + # reload predictions if losses changed over time and a model is + # considered to be in the top models again! + if not isinstance(self.ensemble_nbest, numbers.Integral): + # Transform to number of models to keep. Keep at least one + keep_nbest = max(1, min(len(sorted_keys), + int(len(sorted_keys) * self.ensemble_nbest))) + self.logger.debug( + "Library pruning: using only top %f percent of the models for ensemble " + "(%d out of %d)", + self.ensemble_nbest * 100, keep_nbest, len(sorted_keys) + ) + else: + # Keep only at most ensemble_nbest + keep_nbest = min(self.ensemble_nbest, len(sorted_keys)) + self.logger.debug("Library Pruning: using for ensemble only " + " %d (out of %d) models" % (keep_nbest, len(sorted_keys))) + + # If max_models_on_disc is None, do nothing + # One can only read at most max_models_on_disc models + if self.max_models_on_disc is not None: + if not isinstance(self.max_models_on_disc, numbers.Integral): + consumption = [ + [ + v["ens_loss"], + v["disc_space_cost_mb"], + ] for v in self.read_losses.values() if v["disc_space_cost_mb"] is not None + ] + max_consumption = max(c[1] for c in consumption) + + # We are pessimistic with the consumption limit indicated by + # max_models_on_disc by 1 model. Such model is assumed to spend + # max_consumption megabytes + if (sum(c[1] for c in consumption) + max_consumption) > self.max_models_on_disc: + + # just leave the best -- smaller is better! 
+ # This list is in descending order, to preserve the best models + sorted_cum_consumption = np.cumsum([ + c[1] for c in list(sorted(consumption)) + ]) + max_consumption + max_models = np.argmax(sorted_cum_consumption > self.max_models_on_disc) + + # Make sure that at least 1 model survives + self.max_resident_models = max(1, max_models) + self.logger.warning( + "Limiting num of models via float max_models_on_disc={}" + " as accumulated={} worst={} num_models={}".format( + self.max_models_on_disc, + (sum(c[1] for c in consumption) + max_consumption), + max_consumption, + self.max_resident_models + ) + ) + else: + self.max_resident_models = None + else: + self.max_resident_models = self.max_models_on_disc + + if self.max_resident_models is not None and keep_nbest > self.max_resident_models: + self.logger.debug( + "Restricting the number of models to %d instead of %d due to argument " + "max_models_on_disc", + self.max_resident_models, keep_nbest, + ) + keep_nbest = self.max_resident_models + + # consider performance_range_threshold + if self.performance_range_threshold > 0: + best_loss = sorted_keys[0][1] + worst_loss = dummy_loss[1] + worst_loss -= (worst_loss - best_loss) * self.performance_range_threshold + if sorted_keys[keep_nbest - 1][1] > worst_loss: + # We can further reduce number of models + # since worst model is worse than thresh + for i in range(0, keep_nbest): + # Look at most at keep_nbest models, + # but always keep at least one model + current_loss = sorted_keys[i][1] + if current_loss >= worst_loss: + self.logger.debug("Dynamic Performance range: " + "Further reduce from %d to %d models", + keep_nbest, max(1, i)) + keep_nbest = max(1, i) + break + ensemble_n_best = keep_nbest + + # reduce to keys + reduced_sorted_keys = list(map(lambda x: x[0], sorted_keys)) + + # remove loaded predictions for non-winning models + for k in reduced_sorted_keys[ensemble_n_best:]: + if k in self.read_preds: + self.read_preds[k][Y_ENSEMBLE] = None + self.read_preds[k][Y_TEST] = None + if self.read_losses[k]['loaded'] == 1: + self.logger.debug( + 'Dropping model %s (%d,%d) with loss %f.', + k, + self.read_losses[k]['seed'], + self.read_losses[k]['num_run'], + self.read_losses[k]['ens_loss'], + ) + self.read_losses[k]['loaded'] = 2 + + # Load the predictions for the winning + for k in reduced_sorted_keys[:ensemble_n_best]: + if ( + ( + k not in self.read_preds or self.read_preds[k][Y_ENSEMBLE] is None + ) + and self.read_losses[k]['loaded'] != 3 + ): + self.read_preds[k][Y_ENSEMBLE] = self._read_np_fn(k) + # No need to load test here because they are loaded + # only if the model ends up in the ensemble + self.read_losses[k]['loaded'] = 1 + + # return best scored keys of self.read_losses + return reduced_sorted_keys[:ensemble_n_best] def get_test_preds(self, selected_keys: List[str]) -> List[str]: """ From 056e08857781eb7cf0e027c0bf7fa950e3b97f47 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 14:37:22 +0200 Subject: [PATCH 04/16] made changes to ensemble building to solve persistency issue of ensemble (clean) --- autoPyTorch/ensemble/stacking_ensemble.py | 136 ++++++------ .../ensemble/stacking_ensemble_builder.py | 205 +++++++----------- 2 files changed, 156 insertions(+), 185 deletions(-) diff --git a/autoPyTorch/ensemble/stacking_ensemble.py b/autoPyTorch/ensemble/stacking_ensemble.py index 4d6987eb1..f0621c29b 100644 --- a/autoPyTorch/ensemble/stacking_ensemble.py +++ b/autoPyTorch/ensemble/stacking_ensemble.py @@ -22,16 +22,13 @@ def __init__( metric: autoPyTorchMetric, 
task_type: int, random_state: np.random.RandomState, - # should be with something like numrun_seed_budget. - ensemble_identifiers = None, - best_model_identifier = None, - ensemble_slot_j: int = None, - read_preds = None, + ensemble_slot_j: int ) -> None: self.ensemble_size = ensemble_size self.metric = metric self.random_state = random_state self.task_type = task_type + self.ensemble_slot_j = ensemble_slot_j def __getstate__(self) -> Dict[str, Any]: # Cannot serialize a metric if @@ -50,7 +47,7 @@ def fit( best_model_predictions: np.ndarray, labels: np.ndarray, ensemble_identifiers: List[Tuple[int, int, float]], - best_model_identifier: Tuple[int, int, float] + best_model_identifier: Tuple[int, int, float], ) -> AbstractEnsemble: """ Builds a ensemble given the individual models out of fold predictions. @@ -70,7 +67,11 @@ def fit( Returns: A copy of self """ - + predictions_ensemble[self.ensemble_slot_j] = best_model_predictions + ensemble_identifiers[self.ensemble_slot_j] = best_model_identifier + self._fit(predictions_ensemble, labels) + self.identifiers_ = ensemble_identifiers + self._calculate_weights() return self # TODO: fit a stacked ensemble. @@ -93,37 +94,43 @@ def _fit( A list of model identifiers, each with the form (seed, number of run, budget) """ - self.num_input_models_ = len(predictions) - - ensemble: List[np.ndarray] = [] - trajectory = [] - order = [] - - ensemble_size = self.ensemble_size weighted_ensemble_prediction = np.zeros( predictions[0].shape, dtype=np.float64, ) - # Calculate loss is versatile and can return a dict of slosses - # losses[j] = calculate_loss( - # metrics=[self.metric], - # target=labels, - # prediction=fant_ensemble_prediction, - # task_type=self.task_type, - # )[self.metric.name] + fant_ensemble_prediction = np.zeros( + weighted_ensemble_prediction.shape, + dtype=np.float64, + ) - # all_best = np.argwhere(losses == np.nanmin(losses)).flatten() - # best = self.random_state.choice(all_best) - # ensemble.append(predictions[best]) - # trajectory.append(losses[best]) - # order.append(best) + nonnull_predictions = [pred for pred in predictions if pred is not None] + size = len(nonnull_predictions) + for pred in nonnull_predictions: + np.add( + weighted_ensemble_prediction, + pred, + out=fant_ensemble_prediction + ) + np.multiply( + fant_ensemble_prediction, + (1. / float(size)), + out=fant_ensemble_prediction + ) + + # Calculate loss is versatile and can return a dict of slosses + loss = calculate_loss( + metrics=[self.metric], + target=labels, + prediction=fant_ensemble_prediction, + task_type=self.task_type, + )[self.metric.name] + # store list of preds for later use + self.ensemble_predictions = predictions - self.indices_: List[int] = order - self.trajectory_: List[float] = trajectory - self.train_loss_: float = trajectory[-1] + self.train_loss_: float = loss # TODO: return 1 for models in layer 0, 2 for next and so on # TODO: 0 for models that are not in stack @@ -134,19 +141,22 @@ def _calculate_weights(self) -> None: a frequency counting scheme. In particular, how many times a model was used during hill climbing optimization. """ - ensemble_members = Counter(self.indices_).most_common() weights = np.zeros( - (self.num_input_models_,), + self.ensemble_size, dtype=np.float64, ) - for ensemble_member in ensemble_members: - weight = 1 - weights[ensemble_member[0]] = weight + current_size = len([id for id in self.identifiers_ if id is not None]) + for i, identifier in enumerate(self.identifiers_): + if identifier is not None: + weights[i] = (1. 
/ float(current_size)) self.weights_ = weights # TODO: Adjust this to use weights and make - def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: + def predict(self, predictions: List[np.ndarray]) -> np.ndarray: + return self._predict(predictions, self.weights_) + + def _predict(self, predictions, weights): """ Given a list of predictions from the individual model, this method aggregates the predictions using a soft voting scheme with the weights @@ -158,7 +168,7 @@ def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarra Returns: average (np.ndarray): Soft voting predictions of ensemble models, using - the weights found during ensemble selection (self._weights) + the weights """ average = np.zeros_like(predictions[0], dtype=np.float64) @@ -166,8 +176,8 @@ def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarra # if prediction model.shape[0] == len(non_null_weights), # predictions do not include those of zero-weight models. - if len(predictions) == np.count_nonzero(self.weights_): - non_null_weights = [w for w in self.weights_ if w > 0] + if len(predictions) == np.count_nonzero(weights): + non_null_weights = [w for w in weights if w > 0] for pred, weight in zip(predictions, non_null_weights): np.multiply(pred, weight, out=tmp_predictions) np.add(average, tmp_predictions, out=average) @@ -179,15 +189,15 @@ def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarra del tmp_predictions return average - def __str__(self) -> str: - return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \ - '\n\tWeights: %s\n\tIdentifiers: %s' % \ - (' '.join(['%d: %5f' % (idx, performance) - for idx, performance in enumerate(self.trajectory_)]), - self.indices_, self.weights_, - ' '.join([str(identifier) for idx, identifier in - enumerate(self.identifiers_) - if self.weights_[idx] > 0])) + # def __str__(self) -> str: + # return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \ + # '\n\tWeights: %s\n\tIdentifiers: %s' % \ + # (' '.join(['%d: %5f' % (idx, performance) + # for idx, performance in enumerate(self.trajectory_)]), + # self.indices_, self.weights_, + # ' '.join([str(identifier) for idx, identifier in + # enumerate(self.identifiers_) + # if self.weights_[idx] > 0])) def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: """ @@ -199,14 +209,7 @@ def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: output (List[Tuple[int, int, float]]): The models actually used by ensemble selection """ - output = [] - - for i, weight in enumerate(self.weights_): - identifier = self.identifiers_[i] - if weight > 0.0: - output.append(identifier) - - return output + return self.identifiers_ def get_validation_performance(self) -> float: """ @@ -216,13 +219,24 @@ def get_validation_performance(self) -> float: (float): best ensemble training performance """ - return self.trajectory_[-1] + return self.train_loss_ def predict_with_current_pipeline( self, pipeline_predictions: np.ndarray, - ) -> None: - # TODO: predict with ensemble by replacing model at j = self.iteration mod m, - # where m is ensemble_size. - # returns None - pass + ) -> np.ndarray: + """ + predict with ensemble by replacing model at j = self.iteration mod m, + where m is ensemble_size. 
+ returns ensemble predictions + """ + predictions = self.ensemble_predictions.copy() + if predictions[self.ensemble_slot_j] is None: + total_predictions = len([pred for pred in predictions if pred is not None]) + total_predictions += 1 + weights = [1/total_predictions for pred in predictions if pred is not None] + else: + weights = self.weights_ + + predictions[self.ensemble_slot_j] = pipeline_predictions + return self._predict(predictions, weights) diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index e3f54a818..836f7884c 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -1,8 +1,6 @@ import glob import logging import logging.handlers -import math -from mmap import MADV_NOHUGEPAGE import os import pickle import re @@ -12,7 +10,6 @@ from typing import Dict, List, Optional, Tuple, Union import numpy as np -from numpy.random.mtrand import seed from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import BINARY @@ -20,7 +17,7 @@ from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder from autoPyTorch.ensemble.stacking_ensemble import StackingEnsemble from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric -from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss, calculate_score from autoPyTorch.utils.logging_ import get_named_client_logger Y_ENSEMBLE = 0 @@ -115,6 +112,11 @@ def __init__( seed=seed, precision=precision, memory_limit=memory_limit, read_at_most=read_at_most, random_state=random_state, logger_port=logger_port, unit_test=unit_test) + # we still need to store ensemble identifiers as this class is not persistant + # we can do this by either storing and reading them in this class + # or passing them via the ensemble builder manager which has persistency with the futures stored. + self.ensemble_identifiers: Optional[List[Optional[str]]] = None + # TODO: This is the main wrapper to the EnsembleSelection class which fits # TODO: the ensemble @@ -183,17 +185,10 @@ def main( time_left - used_time, ) - ensemble_identifiers = None - # Get ensemble_identifiers from previous iteration. - ensemble_dir = self.backend.get_ensemble_dir() - if len(os.listdir(ensemble_dir)) >= 1: - old_ensemble = self.backend.load_ensemble(self.seed) - ensemble_identifiers = old_ensemble.ensemble_identifiers - self.ensemble_slot_j = np.mod(iteration, self.ensemble_size) - + self.ensemble_identifiers = self._load_ensemble_identifiers() # populates self.read_preds and self.read_losses with individual model predictions and ensemble loss. 
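# --- Editor's illustrative aside (not part of the patch) ----------------------
# A minimal, self-contained sketch of the per-slot update this builder performs:
# the candidate evaluated at `iteration` only competes for slot
# j = iteration mod ensemble_size; its out-of-fold predictions are substituted
# into that slot while all other slots keep their stored predictions, and the
# loss of the equal-weight average is used to rank the candidate. The function
# name and `loss_fn` are illustrative assumptions, not symbols from the repository.
import numpy as np
from typing import Callable, List, Optional


def fantasized_slot_loss(
    slot_predictions: List[Optional[np.ndarray]],   # one entry per slot, None while a slot is still empty
    candidate_predictions: np.ndarray,               # out-of-fold predictions of the model under evaluation
    slot_j: int,                                     # the slot this iteration may replace
    labels: np.ndarray,
    loss_fn: Callable[[np.ndarray, np.ndarray], float],
) -> float:
    slots = list(slot_predictions)
    slots[slot_j] = candidate_predictions
    filled = [p for p in slots if p is not None]
    average = np.mean(np.stack(filled), axis=0)      # equal-weight soft vote over the filled slots
    return loss_fn(labels, average)
# -------------------------------------------------------------------------------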
- if not self.compute_ensemble_loss_per_model(ensemble_identifiers=ensemble_identifiers): + if not self.compute_ensemble_loss_per_model(): if return_predictions: return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred else: @@ -201,7 +196,7 @@ def main( # Only the models with the n_best predictions are candidates # to be in the ensemble - candidate_models = self.get_n_best_preds() + candidate_models = self.get_candidate_preds() if not candidate_models: # no candidates yet if return_predictions: return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred @@ -230,21 +225,15 @@ def main( # as candidate models is sorted in `get_n_best_preds` best_model_identifier = candidate_models[0] - # initialise ensemble_identifier with best_model_identifier - if ensemble_identifiers == None: - ensemble_identifiers = [best_model_identifier] - # train ensemble ensemble = self.fit_ensemble( - selected_keys=candidate_models, - ensemble_identifiers=ensemble_identifiers, best_model_identifier=best_model_identifier ) # Save the ensemble for later use in the main module! if ensemble is not None and self.SAVE2DISC: self.backend.save_ensemble(ensemble, iteration, self.seed) - + self._save_ensemble_identifiers(ensemble_identifiers=ensemble.identifiers_) # Delete files of non-candidate models - can only be done after fitting the ensemble and # saving it to disc so we do not accidentally delete models in the previous ensemble if self.max_resident_models is not None: @@ -257,15 +246,15 @@ def main( if ensemble is not None: train_pred = self.predict(set_="train", ensemble=ensemble, - selected_keys=candidate_models, - n_preds=len(candidate_models), + selected_keys=ensemble.identifiers_, + n_preds=len(ensemble.identifiers_), index_run=iteration) # TODO if predictions fails, build the model again during the # next iteration! test_pred = self.predict(set_="test", ensemble=ensemble, - selected_keys=n_sel_test, - n_preds=len(candidate_models), + selected_keys=ensemble.identifiers_, + n_preds=len(ensemble.identifiers_), index_run=iteration) # Add a score to run history to see ensemble progress @@ -288,7 +277,7 @@ def main( # TODO: this will help us in choosing the model with the lowest ensemble error. 
# TODO: predictions on ensemble set will be available in read_preds to be used for # TODO: passing to stacking_ensemble_builder.predict() - def compute_ensemble_loss_per_model(self, ensemble_identifiers) -> bool: + def compute_ensemble_loss_per_model(self) -> bool: """ Compute the loss of the predictions on ensemble building data set; populates self.read_preds and self.read_losses @@ -377,8 +366,7 @@ def compute_ensemble_loss_per_model(self, ensemble_identifiers) -> bool: # actually read the predictions and compute their respective loss try: y_ensemble = self._read_np_fn(y_ens_fn) - losses = self.get_ensemble_loss( - ensemble_identifiers=ensemble_identifiers, + losses = self.get_ensemble_loss_with_model( model_predictions=y_ensemble ) @@ -421,69 +409,9 @@ def compute_ensemble_loss_per_model(self, ensemble_identifiers) -> bool: ) return True - def get_test_preds(self, selected_keys: List[str]) -> List[str]: - """ - test predictions from disc - and store them in self.read_preds - Parameters - --------- - selected_keys: list - list of selected keys of self.read_preds - Return - ------ - success_keys: - all keys in selected keys for which we could read the valid and - test predictions - """ - success_keys_test = [] - - for k in selected_keys: - test_fn = glob.glob( - os.path.join( - glob.escape(self.backend.get_runs_directory()), - '%d_%d_%s' % ( - self.read_losses[k]["seed"], - self.read_losses[k]["num_run"], - self.read_losses[k]["budget"], - ), - 'predictions_test_%d_%d_%s.npy*' % ( - self.read_losses[k]["seed"], - self.read_losses[k]["num_run"], - self.read_losses[k]["budget"] - ) - ) - ) - test_fn = [tfn for tfn in test_fn if tfn.endswith('.npy') or tfn.endswith('.npy.gz')] - - if len(test_fn) == 0: - # self.logger.debug("Not found test prediction file (although " - # "ensemble predictions available):%s" % - # test_fn) - pass - else: - if ( - self.read_losses[k]["mtime_test"] == os.path.getmtime(test_fn[0]) - and k in self.read_preds - and self.read_preds[k][Y_TEST] is not None - ): - success_keys_test.append(k) - continue - try: - y_test = self._read_np_fn(test_fn[0]) - self.read_preds[k][Y_TEST] = y_test - success_keys_test.append(k) - self.read_losses[k]["mtime_test"] = os.path.getmtime(test_fn[0]) - except Exception: - self.logger.warning('Error loading %s: %s', - test_fn, traceback.format_exc()) - - return success_keys_test - def fit_ensemble( self, - selected_keys: List[str], - best_model_identifier, - ensemble_identifiers = None + best_model_identifier: str, ) -> Optional[StackingEnsemble]: """ fit ensemble @@ -499,10 +427,12 @@ def fit_ensemble( trained Ensemble """ + assert self.ensemble_identifiers is not None + if self.unit_test: raise MemoryError() - predictions_train = [self.read_preds[k][Y_ENSEMBLE] for k in ensemble_identifiers] + predictions_train = [self.read_preds[k][Y_ENSEMBLE] if k is not None else None for k in self.ensemble_identifiers] best_model_predictions = self.read_preds[best_model_identifier][Y_ENSEMBLE] ensemble_num_runs = [ @@ -511,7 +441,8 @@ def fit_ensemble( self.read_losses[k]["num_run"], self.read_losses[k]["budget"], ) - for k in ensemble_identifiers] + if k is not None else None + for k in self.ensemble_identifiers] best_model_num_run = ( self.read_losses[best_model_identifier]["seed"], @@ -544,6 +475,7 @@ def fit_ensemble( metric=opt_metric, random_state=self.random_state, task_type=self.task_type, + ensemble_slot_j=self.ensemble_slot_j ) try: @@ -557,7 +489,9 @@ def fit_ensemble( best_model_predictions, self.y_true_ensemble, ensemble_num_runs, - 
best_model_num_run) + best_model_num_run + ) + end_time = time.time() self.logger.debug( "Fitting the ensemble took %.2f seconds.", @@ -611,7 +545,8 @@ def predict(self, set_: str, pred_set = Y_TEST else: pred_set = Y_ENSEMBLE - predictions = [self.read_preds[k][pred_set] for k in selected_keys] + + predictions = [self.read_preds[k][pred_set] for k in selected_keys if k is not None] if n_preds == len(predictions): y = ensemble.predict(predictions) @@ -636,11 +571,11 @@ def predict(self, set_: str, ) return None - def get_n_best_preds(self) -> List[str]: + def get_candidate_preds(self) -> List[str]: """ - get best n predictions (i.e., keys of self.read_losses) + gets predictions better than dummy score + (i.e., keys of self.read_losses) according to the loss on the "ensemble set" - n: all models Side effects: ->Define the n-best models to use in ensemble @@ -698,35 +633,30 @@ def get_n_best_preds(self) -> List[str]: # return best scored keys of self.read_losses return reduced_sorted_keys - def get_ensemble_loss(self, ensemble_identifiers: List[str], model_predictions: np.ndarray): + def get_ensemble_loss_with_model(self, model_predictions: np.ndarray): """ Gets the loss of the ensemble given slot j and predictions for new model at slot j set is ensemble Args: - ensemble_identifiers ([type]): [description] model_predictions ([type]): [description] """ + weighted_ensemble_prediction = np.zeros( + model_predictions.shape, + dtype=np.float64, + ) + fant_ensemble_prediction = np.zeros( + weighted_ensemble_prediction.shape, + dtype=np.float64, + ) - if ensemble_identifiers is None: - loss = calculate_loss( - metrics=[self.metric], - target=self.y_true_ensemble, - prediction=model_predictions, - task_type=self.task_type, - ) - else: - weighted_ensemble_prediction = np.zeros( - model_predictions.shape, - dtype=np.float64, - ) - fant_ensemble_prediction = np.zeros( - weighted_ensemble_prediction.shape, - dtype=np.float64, - ) - - - for i, identifier in enumerate(ensemble_identifiers): + for i, identifier in enumerate(self.ensemble_identifiers): + if identifier is None: + if i == self.ensemble_slot_j: + predictions = model_predictions + else: + continue + else: if self.read_preds[identifier][Y_ENSEMBLE] == None: # y ensemble read_preds is loaded in get_n_best_preds. If there is no value for this that means its a new model at this iteration. raise ValueError("check here to resolve starting condition") @@ -742,10 +672,37 @@ def get_ensemble_loss(self, ensemble_identifiers: List[str], model_predictions: (1. 
/ float(self.ensemble_size)), out=fant_ensemble_prediction ) - loss = calculate_loss( - metrics=[self.metric], - target=self.y_true_ensemble, - prediction=fant_ensemble_prediction, - task_type=self.task_type, - ) - return loss \ No newline at end of file + loss = calculate_loss( + metrics=[self.metric], + target=self.y_true_ensemble, + prediction=fant_ensemble_prediction, + task_type=self.task_type, + ) + return loss + + def _get_ensemble_identifiers_filename(self): + return os.path.join(self.backend.temporary_directory, 'ensemble_identifiers.pkl') + + def _save_ensemble_identifiers(self, ensemble_identifiers: List[Optional[str]]) -> None: + with open(self._get_ensemble_identifiers_filename(), "wb") as file: + pickle.dump(ensemble_identifiers, file=file) + + def _load_ensemble_identifiers(self) -> List[Optional[str]]: + if os.path.exists(self._get_ensemble_identifiers_filename()): + with open(self._get_ensemble_identifiers_filename(), "rb") as file: + identifiers = pickle.load(file) + else: + identifiers = [None]*self.ensemble_size + return identifiers + + def _get_identifiers_from_num_runs(self, num_runs, subset='ensemble') -> List[Optional[str]]: + identifiers: List[Optional[str]] = [] + for num_run in num_runs: + identifier = None + if num_run is not None: + seed, idx, budget = num_run + identifier = self.backend.get_prediction_filename(subset, seed, idx, budget) + + identifiers.append(identifier) + return identifiers + From 8fc9b7e1347357ce4846c8996529aa71fccdd9cc Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 14:48:57 +0200 Subject: [PATCH 05/16] make stacking evaluator changes, not tested yet (clean) --- autoPyTorch/evaluation/stacking_evaluator.py | 83 ++++++++++++++------ 1 file changed, 61 insertions(+), 22 deletions(-) diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index 3e1da6e5a..154db1546 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -152,9 +152,11 @@ def __init__(self, backend: Backend, queue: Queue, def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], valid_pred: Optional[np.ndarray], - test_pred: Optional[np.ndarray], additional_run_info: Optional[Dict], - file_output: bool, status: StatusType, - ensemble_pred: Optional[np.ndarray], + test_pred: Optional[np.ndarray], + pipeline_opt_pred: np.ndarray, + ensemble_opt_pred: np.ndarray, + additional_run_info: Optional[Dict], + file_output: bool, status: StatusType, ) -> Optional[Tuple[float, float, int, Dict]]: """This function does everything necessary after the fitting is done: * predicting @@ -166,7 +168,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], if file_output: loss_, additional_run_info_ = self.file_output( - None, valid_pred, test_pred, + ensemble_opt_pred, valid_pred, test_pred ) else: loss_ = None @@ -176,6 +178,10 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], valid_pred, test_pred ) + pipeline_loss, _ = self.calculate_auxiliary_losses( + pipeline_opt_pred, None + ) + if loss_ is not None: return self.duration, loss_, self.seed, additional_run_info_ @@ -188,6 +194,8 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], additional_run_info[metric_name] = value additional_run_info['duration'] = self.duration additional_run_info['num_run'] = self.num_run + if pipeline_loss is not None: + additional_run_info['pipeline_loss'] = pipeline_loss if train_loss is not None: 
additional_run_info['train_loss'] = train_loss if validation_loss is not None: @@ -209,8 +217,22 @@ def file_output( Y_test_pred: np.ndarray, ) -> Tuple[Optional[float], Dict]: + # Abort in case of shape misalignment + if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]: + return ( + 1.0, + { + 'error': + "Targets %s and prediction %s don't have " + "the same length. Probably training didn't " + "finish" % (self.Y_optimization.shape, Y_optimization_pred.shape) + }, + ) + # Abort if predictions contain NaNs for y, s in [ + # Y_train_pred deleted here. Fix unittest accordingly. + [Y_optimization_pred, 'optimization'], [Y_valid_pred, 'validation'], [Y_test_pred, 'test'] ]: @@ -230,6 +252,11 @@ def file_output( else: self.disabled_file_outputs = [] + # This file can be written independently of the others down bellow + if 'y_optimization' not in self.disabled_file_outputs: + if self.output_y_hat_optimization: + self.backend.save_targets_ensemble(self.Y_optimization) + if hasattr(self, 'pipeline') and self.pipeline is not None: if 'pipeline' not in self.disabled_file_outputs: pipeline = self.pipeline @@ -245,7 +272,10 @@ def file_output( budget=float(self.budget), model=pipeline, cv_model=None, - ensemble_predictions=None, + ensemble_predictions=( + Y_optimization_pred if 'y_optimization' not in + self.disabled_file_outputs else None + ), valid_predictions=( Y_valid_pred if 'y_valid' not in self.disabled_file_outputs else None @@ -272,12 +302,19 @@ def fit_predict_and_loss(self) -> None: train_split, test_split = self.splits[split_id] self.Y_optimization = self.y_train[test_split] self.Y_actual_train = self.y_train[train_split] - y_train_pred, y_opt_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, - train_indices=train_split, - test_indices=test_split, - add_pipeline_to_self=True) + ( + y_train_pred, + y_pipeline_opt_pred, + y_ensemble_opt_pred, + y_valid_pred, + y_test_pred + ) = self._fit_and_predict(pipeline, split_id, + train_indices=train_split, + test_indices=test_split, + add_pipeline_to_self=True) + train_loss = self._loss(self.y_train[train_split], y_train_pred) - loss = self._loss(self.y_train[test_split], y_opt_pred) + loss = self._loss(self.y_train[test_split], y_ensemble_opt_pred) additional_run_info = pipeline.get_additional_run_info() if hasattr( pipeline, 'get_additional_run_info') else {} @@ -292,12 +329,13 @@ def fit_predict_and_loss(self) -> None: self.finish_up( loss=loss, train_loss=train_loss, - opt_pred=y_opt_pred, + ensemble_opt_pred=y_ensemble_opt_pred, valid_pred=y_valid_pred, test_pred=y_test_pred, additional_run_info=additional_run_info, file_output=True, status=status, + pipeline_opt_pred=y_pipeline_opt_pred ) @@ -317,12 +355,7 @@ def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Un y = None fit_and_suppress_warnings(self.logger, pipeline, X, y) self.logger.info("Model fitted, now predicting") - ( - Y_train_pred, - Y_opt_pred, - Y_valid_pred, - Y_test_pred - ) = self._predict( + Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred = self._predict( pipeline, train_indices=train_indices, test_indices=test_indices, @@ -333,12 +366,12 @@ def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Un else: self.pipelines[fold] = pipeline - return Y_train_pred, Y_opt_pred, Y_valid_pred, Y_test_pred + return Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred def _predict(self, pipeline: BaseEstimator, test_indices: 
Union[np.ndarray, List], train_indices: Union[np.ndarray, List] - ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: # TODO: load ensemble members and predict using the whole ensemble. # TODO: we need some function to pass this pipeline to the last stored ensemble replace # TODO: model j, where j = ensemble.iteration mod m. then we need to predict @@ -354,9 +387,15 @@ def _predict(self, pipeline: BaseEstimator, if len(os.listdir(ensemble_dir)) >= 1: old_ensemble = self.backend.load_ensemble(self.seed) assert isinstance(old_ensemble, StackingEnsemble) - ensemble_opt_pred = old_ensemble.predict_with_current_model() + ensemble_opt_pred = old_ensemble.predict_with_current_model(pipeline_opt_pred) + else: + ensemble_opt_pred = pipeline_opt_pred.copy() + + if self.X_valid is not None: + valid_pred = self.predict_function(self.X_valid, pipeline, + self.y_valid) else: - ensemble_opt_pred = None + valid_pred = None if self.X_test is not None: test_pred = self.predict_function(self.X_test, pipeline, @@ -364,7 +403,7 @@ def _predict(self, pipeline: BaseEstimator, else: test_pred = None - return train_pred, opt_pred, valid_pred, test_pred + return train_pred, pipeline_opt_pred, ensemble_opt_pred, valid_pred, test_pred # create closure for evaluating an algorithm From 8b931dfda19595cf9b34c364f83d981517c1501d Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 14:52:45 +0200 Subject: [PATCH 06/16] cleanup of stacking evaluator (clean) --- autoPyTorch/evaluation/stacking_evaluator.py | 28 +++++++------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index 154db1546..eeca7b6e4 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -114,7 +114,6 @@ def __init__(self, backend: Backend, queue: Queue, disable_file_output: Union[bool, List] = False, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, - keep_models: Optional[bool] = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None) -> None: super().__init__( @@ -141,14 +140,7 @@ def __init__(self, backend: Backend, queue: Queue, # TODO: Therefore we will have to store pipelines using datamanager and load them, see if we only need predictions. 
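# --- Editor's illustrative aside (not part of the patch) ----------------------
# Schematic of how the stacking evaluator scores a configuration: the freshly
# fitted pipeline predicts on the out-of-fold split, the previously persisted
# StackingEnsemble substitutes those predictions into its current slot via
# predict_with_current_pipeline, and the loss reported to SMAC is that of the
# updated ensemble rather than of the single pipeline. `load_previous_ensemble`
# and `loss_fn` are placeholders, not repository symbols.
def score_candidate(pipeline, X_opt, y_opt, load_previous_ensemble, loss_fn):
    pipeline_opt_pred = pipeline.predict_proba(X_opt)

    previous_ensemble = load_previous_ensemble()      # returns None before the first ensemble is saved
    if previous_ensemble is not None:
        ensemble_opt_pred = previous_ensemble.predict_with_current_pipeline(pipeline_opt_pred)
    else:
        ensemble_opt_pred = pipeline_opt_pred.copy()  # ensemble degenerates to the single model

    pipeline_loss = loss_fn(y_opt, pipeline_opt_pred)     # logged as additional run info
    ensemble_loss = loss_fn(y_opt, ensemble_opt_pred)     # the cost SMAC actually optimises
    return ensemble_loss, pipeline_loss, pipeline_opt_pred
# -------------------------------------------------------------------------------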
# TODO: but we will need the whole pipeline as we would like to predict with different dataset, like val or something - self.num_folds: int = len(self.splits) - self.Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds - self.Y_train_targets: np.ndarray = np.ones(self.y_train.shape) * np.NaN - self.pipelines: List[Optional[BaseEstimator]] = [None] * self.num_folds - self.indices: List[Optional[Tuple[Union[np.ndarray, List], Union[np.ndarray, List]]]] = [None] * self.num_folds - self.logger.debug("Search space updates :{}".format(self.search_space_updates)) - self.keep_models = keep_models def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], valid_pred: Optional[np.ndarray], @@ -310,8 +302,7 @@ def fit_predict_and_loss(self) -> None: y_test_pred ) = self._fit_and_predict(pipeline, split_id, train_indices=train_split, - test_indices=test_split, - add_pipeline_to_self=True) + test_indices=test_split) train_loss = self._loss(self.y_train[train_split], y_train_pred) loss = self._loss(self.y_train[test_split], y_ensemble_opt_pred) @@ -339,11 +330,13 @@ def fit_predict_and_loss(self) -> None: ) - def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Union[np.ndarray, List], - test_indices: Union[np.ndarray, List], - add_pipeline_to_self: bool - ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: - self.indices[fold] = ((train_indices, test_indices)) + def _fit_and_predict( + self, + pipeline: BaseEstimator, + fold: int, + train_indices: Union[np.ndarray, List], + test_indices: Union[np.ndarray, List], + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: # See autoPyTorch/pipeline/components/base_component.py::autoPyTorchComponent for more details # about fit_dictionary @@ -361,10 +354,7 @@ def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Un test_indices=test_indices, ) - if add_pipeline_to_self: - self.pipeline = pipeline - else: - self.pipelines[fold] = pipeline + self.pipeline = pipeline return Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred From f39e482be5ff25818435fadbb88dd0123fea5047 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 15:00:53 +0200 Subject: [PATCH 07/16] added arg to pass ensemble_method from api (clean) --- autoPyTorch/api/base_task.py | 10 +++++++++- autoPyTorch/api/tabular_classification.py | 3 +++ autoPyTorch/api/tabular_regression.py | 3 +++ autoPyTorch/ensemble/ensemble_builder_manager.py | 8 +++++++- autoPyTorch/ensemble/utils.py | 16 ++++++++++++++++ 5 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 autoPyTorch/ensemble/utils.py diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 514af72d2..cb00e9d73 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -49,6 +49,7 @@ ) from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager from autoPyTorch.ensemble.singlebest_ensemble import SingleBest +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash from autoPyTorch.evaluation.utils import DisableFileOutputParameters @@ -171,6 +172,7 @@ def __init__( logging_config: Optional[Dict] = None, ensemble_size: int = 50, ensemble_nbest: int = 50, + ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, 
max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, @@ -193,6 +195,7 @@ def __init__( self.n_threads = n_threads self.ensemble_size = ensemble_size self.ensemble_nbest = ensemble_nbest + self.ensemble_method = ensemble_method self.max_models_on_disc = max_models_on_disc self.logging_config: Optional[Dict] = logging_config self.include_components: Optional[Dict] = include_components @@ -1249,7 +1252,8 @@ def _search( ensemble_size=self.ensemble_size, ensemble_nbest=self.ensemble_nbest, precision=precision, - optimize_metric=self.opt_metric + optimize_metric=self.opt_metric, + ensemble_method=self.ensemble_method ) self._stopwatch.stop_task(ensemble_task_name) @@ -1705,6 +1709,7 @@ def fit_ensemble( precision: Optional[int] = None, ensemble_nbest: int = 50, ensemble_size: int = 50, + ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, load_models: bool = True, time_for_task: int = 100, func_eval_time_limit_secs: int = 50, @@ -1815,6 +1820,7 @@ def fit_ensemble( precision=precision, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, + ensemble_method=ensemble_method, ) manager.build_ensemble(self._dask_client) @@ -1834,6 +1840,7 @@ def _init_ensemble_builder( self, time_left_for_ensembles: float, optimize_metric: str, + ensemble_method: int, ensemble_nbest: int, ensemble_size: int, precision: int = 32, @@ -1887,6 +1894,7 @@ def _init_ensemble_builder( opt_metric=optimize_metric, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, + ensemble_method=ensemble_method, max_models_on_disc=self.max_models_on_disc, seed=self.seed, max_iterations=None, diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 3d80a0338..da1cf293b 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -22,6 +22,7 @@ ) from autoPyTorch.datasets.tabular_dataset import TabularDataset from autoPyTorch.evaluation.utils import DisableFileOutputParameters +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -87,6 +88,7 @@ def __init__( logging_config: Optional[Dict] = None, ensemble_size: int = 50, ensemble_nbest: int = 50, + ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, @@ -106,6 +108,7 @@ def __init__( logging_config=logging_config, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, + ensemble_method=ensemble_method, max_models_on_disc=max_models_on_disc, temporary_directory=temporary_directory, output_directory=output_directory, diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index 073b4d77c..c9f21e453 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -22,6 +22,7 @@ ) from autoPyTorch.datasets.tabular_dataset import TabularDataset from autoPyTorch.evaluation.utils import DisableFileOutputParameters +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -87,6 +88,7 @@ def __init__( logging_config: Optional[Dict] = None, 
ensemble_size: int = 50, ensemble_nbest: int = 50, + ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, @@ -106,6 +108,7 @@ def __init__( logging_config=logging_config, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, + ensemble_method=ensemble_method, max_models_on_disc=max_models_on_disc, temporary_directory=temporary_directory, output_directory=output_directory, diff --git a/autoPyTorch/ensemble/ensemble_builder_manager.py b/autoPyTorch/ensemble/ensemble_builder_manager.py index 845992064..06f8e696c 100644 --- a/autoPyTorch/ensemble/ensemble_builder_manager.py +++ b/autoPyTorch/ensemble/ensemble_builder_manager.py @@ -20,6 +20,7 @@ from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import BINARY from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder +from autoPyTorch.ensemble.utils import get_ensemble_builder_class from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.logging_ import get_named_client_logger @@ -37,6 +38,7 @@ def __init__( opt_metric: str, ensemble_size: int, ensemble_nbest: int, + ensemble_method: int, max_models_on_disc: Union[float, int], seed: int, precision: int, @@ -111,6 +113,7 @@ def __init__( self.opt_metric = opt_metric self.ensemble_size = ensemble_size self.ensemble_nbest = ensemble_nbest + self.ensemble_method = ensemble_method self.max_models_on_disc: Union[float, int] = max_models_on_disc self.seed = seed self.precision = precision @@ -210,6 +213,7 @@ def build_ensemble( opt_metric=self.opt_metric, ensemble_size=self.ensemble_size, ensemble_nbest=self.ensemble_nbest, + ensemble_method=self.ensemble_method, max_models_on_disc=self.max_models_on_disc, seed=self.seed, precision=self.precision, @@ -252,6 +256,7 @@ def fit_and_return_ensemble( opt_metric: str, ensemble_size: int, ensemble_nbest: int, + ensemble_method: int, max_models_on_disc: Union[float, int], seed: int, precision: int, @@ -330,7 +335,8 @@ def fit_and_return_ensemble( A list with the performance history of this ensemble, of the form [[pandas_timestamp, train_performance, val_performance, test_performance], ...] 
""" - result = EnsembleBuilder( + ensemble_builder = get_ensemble_builder_class(ensemble_method) + result = ensemble_builder( backend=backend, dataset_name=dataset_name, task_type=task_type, diff --git a/autoPyTorch/ensemble/utils.py b/autoPyTorch/ensemble/utils.py new file mode 100644 index 000000000..705d17e24 --- /dev/null +++ b/autoPyTorch/ensemble/utils.py @@ -0,0 +1,16 @@ +from enum import IntEnum + +from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder +from autoPyTorch.ensemble.stacking_ensemble_builder import StackingEnsembleBuilder + + +class EnsembleSelectionTypes(IntEnum): + ensemble_selection = 1 + stacking_ensemble = 2 + + +def get_ensemble_builder_class(ensemble_method: int): + if ensemble_method == EnsembleSelectionTypes.ensemble_selection: + return EnsembleBuilder + elif ensemble_method == EnsembleSelectionTypes.stacking_ensemble: + return StackingEnsembleBuilder From b6eb0a601d0dd82a41d63e989de26ef4781c16dd Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 17:19:34 +0200 Subject: [PATCH 08/16] working version of levesque et al (clean) --- autoPyTorch/api/base_task.py | 23 ++++-- autoPyTorch/ensemble/stacking_ensemble.py | 42 ++++++++-- .../ensemble/stacking_ensemble_builder.py | 50 ++++++------ autoPyTorch/evaluation/stacking_evaluator.py | 5 +- autoPyTorch/evaluation/tae.py | 11 ++- autoPyTorch/optimizer/smbo.py | 4 + .../20_basics/example_stacking_ensemble.py | 76 +++++++++++++++++++ 7 files changed, 167 insertions(+), 44 deletions(-) create mode 100644 examples/20_basics/example_stacking_ensemble.py diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index cb00e9d73..d4d734e6f 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -761,7 +761,8 @@ def _do_dummy_prediction(self) -> None: stats=stats, memory_limit=memory_limit, disable_file_output=self._disable_file_output, - all_supported_metrics=self._all_supported_metrics + all_supported_metrics=self._all_supported_metrics, + ensemble_method=self.ensemble_method ) status, _, _, additional_info = ta.run(num_run, cutoff=self._time_for_task) @@ -1290,6 +1291,7 @@ def _search( min_budget=min_budget, max_budget=max_budget, ensemble_callback=proc_ensemble, + ensemble_method=self.ensemble_method, logger_port=self._logger_port, # We do not increase the num_run here, this is something # smac does internally @@ -1820,7 +1822,6 @@ def fit_ensemble( precision=precision, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, - ensemble_method=ensemble_method, ) manager.build_ensemble(self._dask_client) @@ -1990,7 +1991,8 @@ def predict( def score( self, y_pred: np.ndarray, - y_test: Union[np.ndarray, pd.DataFrame] + y_test: Union[np.ndarray, pd.DataFrame], + metric: Optional[str] = None ) -> Dict[str, float]: """Calculate the score on the test set. Calculate the evaluation measure on the test set. @@ -2005,15 +2007,22 @@ def score( Dict[str, float]: Value of the evaluation metric calculated on the test set. """ - if self._metric is None: - raise ValueError("No metric found. 
Either fit/search has not been called yet " - "or AutoPyTorch failed to infer a metric from the dataset ") + if metric is not None: + required_dataset_properties = {'task_type': self.task_type, + 'output_type': self.dataset.output_type} + metric = get_metrics( + dataset_properties=required_dataset_properties, + names=[metric] + )[0] + else: + metric = self._metric + if self.task_type is None: raise ValueError("AutoPytorch failed to infer a task type from the dataset " "Please check the log file for related errors. ") return calculate_score(target=y_test, prediction=y_pred, task_type=STRING_TO_TASK_TYPES[self.task_type], - metrics=[self._metric]) + metrics=[metric]) def __getstate__(self) -> Dict[str, Any]: # Cannot serialize a client! diff --git a/autoPyTorch/ensemble/stacking_ensemble.py b/autoPyTorch/ensemble/stacking_ensemble.py index f0621c29b..913a3024f 100644 --- a/autoPyTorch/ensemble/stacking_ensemble.py +++ b/autoPyTorch/ensemble/stacking_ensemble.py @@ -1,5 +1,6 @@ from collections import Counter from typing import Any, Dict, List, Tuple, Union +import warnings import numpy as np from sklearn.base import BaseEstimator @@ -176,16 +177,17 @@ def _predict(self, predictions, weights): # if prediction model.shape[0] == len(non_null_weights), # predictions do not include those of zero-weight models. - if len(predictions) == np.count_nonzero(weights): - non_null_weights = [w for w in weights if w > 0] - for pred, weight in zip(predictions, non_null_weights): - np.multiply(pred, weight, out=tmp_predictions) - np.add(average, tmp_predictions, out=average) + if len([pred for pred in predictions if pred is not None]) == np.count_nonzero(weights): + for pred, weight in zip(predictions, weights): + if pred is not None: + np.multiply(pred, weight, out=tmp_predictions) + np.add(average, tmp_predictions, out=average) # If none of the above applies, then something must have gone wrong. else: - raise ValueError("The dimensions of ensemble predictions" - " and ensemble weights do not match!") + raise ValueError(f"{len(predictions)}, {self.weights_}\n" + f"The dimensions of non null ensemble predictions" + f" and ensemble weights do not match!") del tmp_predictions return average @@ -240,3 +242,29 @@ def predict_with_current_pipeline( predictions[self.ensemble_slot_j] = pipeline_predictions return self._predict(predictions, weights) + + def get_models_with_weights( + self, + models: Dict[Any, BasePipeline] + ) -> List[Tuple[float, BasePipeline]]: + """ + Handy function to tag the provided input models with a given weight. + Args: + models (List[Tuple[float, BasePipeline]]): + A dictionary that maps a model's name to it's actual python object. + Returns: + output (List[Tuple[float, BasePipeline]]): + each model with the related weight, sorted by ascending + performance. Notice that ensemble selection solves a minimization + problem. 
+ """ + output = [] + for i, weight in enumerate(self.weights_): + if weight > 0.0: + identifier = self.identifiers_[i] + model = models[identifier] + output.append((weight, model)) + + output.sort(reverse=True, key=lambda t: t[0]) + + return output \ No newline at end of file diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index 836f7884c..ad6136e26 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -6,6 +6,7 @@ import re import time import traceback +import warnings import zlib from typing import Dict, List, Optional, Tuple, Union @@ -186,6 +187,7 @@ def main( ) self.ensemble_slot_j = np.mod(iteration, self.ensemble_size) + self.logger.debug(f"Iteration for ensemble building:{iteration}") self.ensemble_identifiers = self._load_ensemble_identifiers() # populates self.read_preds and self.read_losses with individual model predictions and ensemble loss. if not self.compute_ensemble_loss_per_model(): @@ -233,7 +235,11 @@ def main( # Save the ensemble for later use in the main module! if ensemble is not None and self.SAVE2DISC: self.backend.save_ensemble(ensemble, iteration, self.seed) - self._save_ensemble_identifiers(ensemble_identifiers=ensemble.identifiers_) + ensemble_identifiers=self._get_identifiers_from_num_runs(ensemble.identifiers_) + self.logger.debug(f"ensemble_identifiers being saved are {ensemble_identifiers}") + self._save_ensemble_identifiers( + ensemble_identifiers=ensemble_identifiers + ) # Delete files of non-candidate models - can only be done after fitting the ensemble and # saving it to disc so we do not accidentally delete models in the previous ensemble if self.max_resident_models is not None: @@ -246,15 +252,15 @@ def main( if ensemble is not None: train_pred = self.predict(set_="train", ensemble=ensemble, - selected_keys=ensemble.identifiers_, - n_preds=len(ensemble.identifiers_), + selected_keys=ensemble_identifiers, + n_preds=len(ensemble_identifiers), index_run=iteration) # TODO if predictions fails, build the model again during the # next iteration! 
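# --- Editor's illustrative aside (not part of the patch) ----------------------
# The builder process is short-lived, so the slot -> prediction-file mapping is
# pickled into the backend's temporary directory and reloaded at the next
# iteration (see _save_ensemble_identifiers / _load_ensemble_identifiers defined
# earlier in this patch series). A minimal sketch of that round trip; the file
# name mirrors the patch, while the helper names and `tmp_dir` are assumptions.
import os
import pickle
from typing import List, Optional


def save_identifiers(tmp_dir: str, identifiers: List[Optional[str]]) -> None:
    with open(os.path.join(tmp_dir, "ensemble_identifiers.pkl"), "wb") as fh:
        pickle.dump(identifiers, fh)


def load_identifiers(tmp_dir: str, ensemble_size: int) -> List[Optional[str]]:
    path = os.path.join(tmp_dir, "ensemble_identifiers.pkl")
    if not os.path.exists(path):
        return [None] * ensemble_size                 # first iteration: every slot is still empty
    with open(path, "rb") as fh:
        return pickle.load(fh)
# -------------------------------------------------------------------------------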
test_pred = self.predict(set_="test", ensemble=ensemble, - selected_keys=ensemble.identifiers_, - n_preds=len(ensemble.identifiers_), + selected_keys=ensemble_identifiers, + n_preds=len(ensemble_identifiers), index_run=iteration) # Add a score to run history to see ensemble progress @@ -450,21 +456,6 @@ def fit_ensemble( self.read_losses[best_model_identifier]["budget"], ) - # check hash if ensemble training data changed - current_hash = "".join([ - str(zlib.adler32(predictions_train[i].data.tobytes())) - for i in range(len(predictions_train)) - ]) - if self.last_hash == current_hash: - self.logger.debug( - "No new model predictions selected -- skip ensemble building " - "-- current performance: %f", - self.validation_performance_, - ) - - return None - self.last_hash = current_hash - opt_metric = [m for m in self.metrics if m.name == self.opt_metric][0] if not opt_metric: raise ValueError(f"Cannot optimize for {self.opt_metric} in {self.metrics} " @@ -483,6 +474,7 @@ def fit_ensemble( "Fitting the ensemble on %d models.", len(predictions_train), ) + self.logger.debug(f"predictions sent to ensemble: {predictions_train}") start_time = time.time() ensemble.fit( predictions_train, @@ -497,6 +489,7 @@ def fit_ensemble( "Fitting the ensemble took %.2f seconds.", end_time - start_time, ) + self.logger.debug(f"weights = {ensemble.weights_}") self.logger.info(str(ensemble)) self.validation_performance_ = min( self.validation_performance_, @@ -546,7 +539,9 @@ def predict(self, set_: str, else: pred_set = Y_ENSEMBLE - predictions = [self.read_preds[k][pred_set] for k in selected_keys if k is not None] + self.logger.debug(f"selected_keys with {set_} for predict are {selected_keys}") + predictions = [self.read_preds[k][pred_set] if k is not None else None for k in selected_keys] + self.logger.debug(f"predictions with {set_} for predict are {len(predictions)}") if n_preds == len(predictions): y = ensemble.predict(predictions) @@ -562,6 +557,7 @@ def predict(self, set_: str, ) return y else: + warnings.warn("this is not true so this is the problem") self.logger.info( "Found inconsistent number of predictions and models (%d vs " "%d) for subset %s", @@ -657,9 +653,9 @@ def get_ensemble_loss_with_model(self, model_predictions: np.ndarray): else: continue else: - if self.read_preds[identifier][Y_ENSEMBLE] == None: + if self.read_preds[identifier][Y_ENSEMBLE] is None: # y ensemble read_preds is loaded in get_n_best_preds. If there is no value for this that means its a new model at this iteration. 
- raise ValueError("check here to resolve starting condition") + raise ValueError(f"check here to resolve starting condition, {self.read_preds[identifier]}") predictions = self.read_preds[identifier][Y_ENSEMBLE] if i != self.ensemble_slot_j else model_predictions np.add( @@ -673,7 +669,7 @@ def get_ensemble_loss_with_model(self, model_predictions: np.ndarray): out=fant_ensemble_prediction ) loss = calculate_loss( - metrics=[self.metric], + metrics=self.metrics, target=self.y_true_ensemble, prediction=fant_ensemble_prediction, task_type=self.task_type, @@ -701,8 +697,10 @@ def _get_identifiers_from_num_runs(self, num_runs, subset='ensemble') -> List[Op identifier = None if num_run is not None: seed, idx, budget = num_run - identifier = self.backend.get_prediction_filename(subset, seed, idx, budget) - + identifier = os.path.join( + self.backend.get_numrun_directory(seed, idx, budget), + self.backend.get_prediction_filename(subset, seed, idx, budget) + ) identifiers.append(identifier) return identifiers diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index eeca7b6e4..11401fe2b 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -195,6 +195,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], if test_loss is not None: additional_run_info['test_loss'] = test_loss + additional_run_info['opt_loss'] = loss rval_dict = {'loss': cost, 'additional_run_info': additional_run_info, 'status': status} @@ -374,10 +375,10 @@ def _predict(self, pipeline: BaseEstimator, self.y_train[train_indices]) ensemble_dir = self.backend.get_ensemble_dir() - if len(os.listdir(ensemble_dir)) >= 1: + if os.path.exists(ensemble_dir) and len(os.listdir(ensemble_dir)) >= 1: old_ensemble = self.backend.load_ensemble(self.seed) assert isinstance(old_ensemble, StackingEnsemble) - ensemble_opt_pred = old_ensemble.predict_with_current_model(pipeline_opt_pred) + ensemble_opt_pred = old_ensemble.predict_with_current_pipeline(pipeline_opt_pred) else: ensemble_opt_pred = pipeline_opt_pred.copy() diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 17830ee94..c756d5e8e 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -21,6 +21,8 @@ from smac.stats.stats import Stats from smac.tae import StatusType, TAEAbortException from smac.tae.execute_func import AbstractTAFunc +from autoPyTorch.ensemble import ensemble_selection +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.datasets.resampling_strategy import ( @@ -28,6 +30,7 @@ HoldoutValTypes, NoResamplingStrategyTypes ) +import autoPyTorch.evaluation.stacking_evaluator from autoPyTorch.evaluation.test_evaluator import eval_test_function from autoPyTorch.evaluation.train_evaluator import eval_train_function from autoPyTorch.evaluation.utils import ( @@ -127,7 +130,8 @@ def __init__( ta: Optional[Callable] = None, logger_port: int = None, all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + ensemble_method = None ): self.backend = backend @@ -146,7 +150,10 @@ def __init__( self.resampling_strategy_args = dm.resampling_strategy_args if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): - eval_function = eval_train_function + if ensemble_method is 
None or ensemble_method == EnsembleSelectionTypes.ensemble_selection: + eval_function = eval_train_function + elif ensemble_method == EnsembleSelectionTypes.stacking_ensemble: + eval_function = autoPyTorch.evaluation.stacking_evaluator.eval_function self.output_y_hat_optimization = output_y_hat_optimization elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): eval_function = eval_test_function diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index b6242e379..945ff880d 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -25,6 +25,7 @@ NoResamplingStrategyTypes ) from autoPyTorch.ensemble.ensemble_builder_manager import EnsembleBuilderManager +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash from autoPyTorch.optimizer.utils import read_return_initial_configurations from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric @@ -115,6 +116,7 @@ def __init__(self, pynisher_context: str = 'spawn', min_budget: int = 5, max_budget: int = 50, + ensemble_method: int = EnsembleSelectionTypes.ensemble_selection ): """ Interface to SMAC. This method calls the SMAC optimize method, and allows @@ -228,6 +230,7 @@ def __init__(self, self.pynisher_context = pynisher_context self.min_budget = min_budget self.max_budget = max_budget + self.ensemble_method = ensemble_method self.ensemble_callback = ensemble_callback @@ -292,6 +295,7 @@ def run_smbo(self, func: Optional[Callable] = None pipeline_config=self.pipeline_config, search_space_updates=self.search_space_updates, pynisher_context=self.pynisher_context, + ensemble_method=self.ensemble_method ) ta = ExecuteTaFuncWithQueue self.logger.info("Finish creating Target Algorithm (TA) function") diff --git a/examples/20_basics/example_stacking_ensemble.py b/examples/20_basics/example_stacking_ensemble.py new file mode 100644 index 000000000..4ceefda8d --- /dev/null +++ b/examples/20_basics/example_stacking_ensemble.py @@ -0,0 +1,76 @@ +""" +====================== +Tabular Classification +====================== +The following example shows how to fit a sample classification model +with AutoPyTorch +""" +import os +import tempfile as tmp +import warnings + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import sklearn.datasets +import sklearn.model_selection + +from autoPyTorch.api.tabular_classification import TabularClassificationTask +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes + +############################################################################ +# Data Loading +# ============ +X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X[:200], + y[:200], + random_state=1, +) + +############################################################################ +# Build and fit a classifier +# ========================== +api = TabularClassificationTask( + # To maintain logs of the run, you can uncomment the + # Following lines + temporary_directory='./tmp/autoPyTorch_example_tmp_02', + output_directory='./tmp/autoPyTorch_example_out_02', + delete_tmp_folder_after_terminate=False, + 
delete_output_folder_after_terminate=False, + seed=42, + ensemble_method=EnsembleSelectionTypes.stacking_ensemble, + ensemble_size=5 +) + +############################################################################ +# Search for an ensemble of machine learning algorithms +# ===================================================== +api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + dataset_name='Australian', + optimize_metric='zero_one_loss', + total_walltime_limit=300, + func_eval_time_limit_secs=50, + enable_traditional_pipeline=False +) + +############################################################################ +# Print the final ensemble performance +# ==================================== +y_pred = api.predict(X_test) +score = api.score(y_pred, y_test, metric='accuracy') +print(score) +# Print the final ensemble built by AutoPyTorch +print(api.show_models()) + +# Print statistics from search +# print(api.sprint_statistics()) \ No newline at end of file From 3681c8f0b1d21e21668545c9b6721bf186ae374b Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 17:21:53 +0200 Subject: [PATCH 09/16] create callback and new smbo --- .../ensemble/ensemble_builder_manager.py | 1 - autoPyTorch/optimizer/run_history_callback.py | 293 ++++++++++++++++++ autoPyTorch/optimizer/utils.py | 128 +++++++- 3 files changed, 420 insertions(+), 2 deletions(-) create mode 100644 autoPyTorch/optimizer/run_history_callback.py diff --git a/autoPyTorch/ensemble/ensemble_builder_manager.py b/autoPyTorch/ensemble/ensemble_builder_manager.py index 06f8e696c..7c0786bb9 100644 --- a/autoPyTorch/ensemble/ensemble_builder_manager.py +++ b/autoPyTorch/ensemble/ensemble_builder_manager.py @@ -19,7 +19,6 @@ from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import BINARY -from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder from autoPyTorch.ensemble.utils import get_ensemble_builder_class from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.logging_ import get_named_client_logger diff --git a/autoPyTorch/optimizer/run_history_callback.py b/autoPyTorch/optimizer/run_history_callback.py new file mode 100644 index 000000000..02d0616ba --- /dev/null +++ b/autoPyTorch/optimizer/run_history_callback.py @@ -0,0 +1,293 @@ +from json import dump, load +import json +import logging +import os +import pickle +import re +import time +import traceback +from typing import List, Union, Dict, Tuple, Optional + +import dask.distributed +from distributed.utils import Any +from numpy.random.mtrand import seed + +from smac.optimizer.smbo import SMBO +from smac.runhistory.runhistory import RunInfo, RunKey +from torch.utils import data +from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes + +from autoPyTorch.optimizer.utils import AdjustRunHistoryCallback +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.automl_common.common.utils.logging_ import get_named_client_logger + + +MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' + +class RunHistoryUpdaterManager(AdjustRunHistoryCallback): + def __init__( + self, + backend: Backend, + random_state: int, + dataset_name: str, + resampling_strategy: Union[ + HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes + ], + resampling_strategy_args: Dict[str, Any], + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + ): + """ + 
SMAC callback to update run history + Args: + backend: util.backend.Backend + backend to write and read files + logger_port: int + port in where to publish a msg + + Returns: + List[Tuple[int, float, float, float]]: + A list with the performance history of this ensemble, of the form + [[pandas_timestamp, train_performance, val_performance, test_performance], ...] + """ + + self.backend = backend + + self.random_state = random_state + self.logger_port = logger_port + + # We only submit new ensembles when there is not an active ensemble job + self.futures: List[dask.Future] = [] + + # The last criteria is the number of iterations + self.iteration = 0 + + # Keep track of when we started to know when we need to finish! + self.start_time = time.time() + self.dataset_name = dataset_name + self.resampling_strategy = resampling_strategy + self.resampling_strategy_args = resampling_strategy_args + + def __call__( + self, + smbo: 'SMBO', + ) -> None: + self.adjust_run_history(smbo.tae_runner.client) + + def adjust_run_history( + self, + dask_client: dask.distributed.Client, + unit_test: bool = False + ) -> None: + + # The second criteria is elapsed time + elapsed_time = time.time() - self.start_time + + logger = get_named_client_logger( + name='EnsembleBuilder', + port=self.logger_port, + ) + + if len(self.futures) != 0: + if self.futures[0].done(): + result = self.futures.pop().result() + if result: + ensemble_history, self.ensemble_nbest, _, _ = result + logger.debug("iteration={} @ elapsed_time={} has history={}".format( + self.iteration, + elapsed_time, + ensemble_history, + )) + + # Only submit new jobs if the previous ensemble job finished + if len(self.futures) == 0: + + # Add the result of the run + # On the next while iteration, no references to + # ensemble builder object, so it should be garbage collected to + # save memory while waiting for resources + # Also, notice how ensemble nbest is returned, so we don't waste + # iterations testing if the deterministic predictions size can + # be fitted in memory + try: + # Submit a Dask job from this job, to properly + # see it in the dask diagnostic dashboard + # Notice that the forked ensemble_builder_process will + # wait for the below function to be done + self.futures.append( + dask_client.submit( + return_run_info_cost, + backend=self.backend, + dataset_name=self.dataset_name, + iteration=self.iteration, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + logger_port=self.logger_port, + priority=100 + ) + ) + + logger.info( + "{}/{} Started Ensemble builder job at {} for iteration {}.".format( + # Log the client to make sure we + # remain connected to the scheduler + self.futures[0], + dask_client, + time.strftime("%Y.%m.%d-%H.%M.%S"), + self.iteration, + ), + ) + self.iteration += 1 + except Exception as e: + exception_traceback = traceback.format_exc() + error_message = repr(e) + logger.critical(exception_traceback) + logger.critical(error_message) + + +def return_run_info_cost( + backend: Backend, + dataset_name: str, + resampling_strategy: Union[ + HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes + ], + resampling_strategy_args: Dict[str, Any], + iteration: int, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, +) -> Optional[List[Tuple[RunKey, float]]]: + """ + A short function to fit and create an ensemble. 
It is just a wrapper to easily send + a request to dask to create an ensemble and clean the memory when finished + Parameters + ---------- + backend: util.backend.Backend + backend to write and read files + dataset_name: str + name of dataset + metrics: List[autoPyTorchMetric], + A set of metrics that will be used to get performance estimates + opt_metric: + Name of the metric to optimize + task_type: int + type of output expected in the ground truth + ensemble_size: int + maximal size of ensemble (passed to ensemble.ensemble_selection) + ensemble_nbest: int/float + if int: consider only the n best prediction + if float: consider only this fraction of the best models + Both wrt to validation predictions + If performance_range_threshold > 0, might return less models + max_models_on_disc: int + Defines the maximum number of models that are kept in the disc. + If int, it must be greater or equal than 1, and dictates the max number of + models to keep. + If float, it will be interpreted as the max megabytes allowed of disc space. That + is, if the number of ensemble candidates require more disc space than this float + value, the worst models will be deleted to keep within this budget. + Models and predictions of the worst-performing models will be deleted then. + If None, the feature is disabled. + It defines an upper bound on the models that can be used in the ensemble. + seed: int + random seed + precision (int): [16,32,64,128] + precision of floats to read the predictions + memory_limit: Optional[int] + memory limit in mb. If ``None``, no memory limit is enforced. + read_at_most: int + read at most n new prediction files in each iteration + end_at: float + At what time the job must finish. Needs to be the endtime and not the time left + because we do not know when dask schedules the job. + iteration: int + The current iteration + pynisher_context: str + Context to use for multiprocessing, can be either fork, spawn or forkserver. + logger_port: int + The port where the logging server is listening to. + unit_test: bool + Turn on unit testing mode. This currently makes fit_ensemble raise a MemoryError. + Having this is very bad coding style, but I did not find a way to make + unittest.mock work through the pynisher with all spawn contexts. If you know a + better solution, please let us know by opening an issue. + Returns + ------- + List[Tuple[int, float, float, float]] + A list with the performance history of this ensemble, of the form + [[pandas_timestamp, train_performance, val_performance, test_performance], ...] + """ + result = RunHistoryUpdater( + backend=backend, + dataset_name=dataset_name, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + logger_port=logger_port, + ).run( + iteration=iteration, + ) + return result + + +class RunHistoryUpdater: + def __init__( + self, + backend: Backend, + dataset_name: str, + resampling_strategy: Union[ + HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes + ], + resampling_strategy_args: Dict[str, Any], + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + ): + """ + SMAC callback to update run history + Args: + backend: util.backend.Backend + backend to write and read files + logger_port: int + port in where to publish a msg + + Returns: + List[Tuple[int, float, float, float]]: + A list with the performance history of this ensemble, of the form + [[pandas_timestamp, train_performance, val_performance, test_performance], ...] 
+ """ + + self.model_fn_re = re.compile(MODEL_FN_RE) + self.logger_port = logger_port + self.logger = get_named_client_logger( + name='RunHistoryUpdater', + port=self.logger_port, + ) + self.ensemble_loss_file = os.path.join(backend.internals_directory, 'ensemble_read_losses.pkl') + if isinstance(resampling_strategy, CrossValTypes): + num_splits = resampling_strategy_args['num_splits'] + self.instances = [[json.dumps({'task_id': dataset_name, + 'fold': fold_number})] + for fold_number in range(num_splits)] + else: + self.instances = [[json.dumps({'task_id': dataset_name})]] + + def run(self, iteration: int) -> Optional[List[Tuple[RunKey, float]]]: + results: List[Tuple[RunInfo, float]] = [] + if os.path.exists(self.ensemble_loss_file): + try: + with (open(self.ensemble_loss_file, "rb")) as memory: + read_losses = pickle.load(memory) + except Exception as e: + self.logger.debug(f"Could not read losses at iteration: {iteration} with exception {e}") + return + else: + for k in read_losses.keys(): + match = self.model_fn_re.search(k) + if match is None or read_losses[k]["loaded"] != 1: + continue + else: + _num_run = int(match.group(2)) + _budget = float(match.group(3)) + run_key = RunKey( + seed=0, # 0 is hardcoded for the runhistory coming from smac + config_id=_num_run, + budget=_budget, + instance_id=self.instances[-1] + ) + results.append((run_key, read_losses[k]["ens_loss"])) + return results diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index 6fb9d5024..37c6795fc 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -1,10 +1,25 @@ import json import os import warnings -from typing import Any, Dict, List +from typing import Any, Dict, List, Union from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +import numpy as np + +from smac.optimizer.smbo import SMBO +from smac.scenario.scenario import Scenario +from smac.stats.stats import Stats +from smac.initial_design.initial_design import InitialDesign +from smac.runhistory.runhistory import RunHistory, RunInfo, RunValue +from smac.runhistory.runhistory2epm import AbstractRunHistory2EPM +from smac.intensification.abstract_racer import AbstractRacer +from smac.epm.rf_with_instances import RandomForestWithInstances +from smac.optimizer.ei_optimization import AbstractAcquisitionFunction, AcquisitionFunctionMaximizer +from smac.tae import FirstRunCrashedException, StatusType, TAEAbortException +from smac.tae.base import BaseRunner +from smac.optimizer.random_configuration_chooser import RandomConfigurationChooser, ChooserNoCoolDown + def read_return_initial_configurations( config_space: ConfigurationSpace, @@ -31,3 +46,114 @@ def read_return_initial_configurations( f"Therefore, it can't be used as an initial " f"configuration as it does not match the current config space. 
") return initial_configurations + +class AdjustRunHistoryCallback: + """ + Allows manipulating run history for custom needs + """ + def __call__(self, smbo: 'SMBO') -> RunHistory: + pass + +class autoPyTorchSMBO(SMBO): + def __init__(self, + scenario: Scenario, + stats: Stats, + initial_design: InitialDesign, + runhistory: RunHistory, + runhistory2epm: AbstractRunHistory2EPM, + intensifier: AbstractRacer, + num_run: int, + model: RandomForestWithInstances, + acq_optimizer: AcquisitionFunctionMaximizer, + acquisition_func: AbstractAcquisitionFunction, + rng: np.random.RandomState, + tae_runner: BaseRunner, + restore_incumbent: Configuration = None, + random_configuration_chooser: Union[RandomConfigurationChooser] = ChooserNoCoolDown(2.0), + predict_x_best: bool = True, + min_samples_model: int = 1): + super().__init__( + scenario, + stats, + initial_design, + runhistory, + runhistory2epm, + intensifier, + num_run, + model, + acq_optimizer, + acquisition_func, + rng, + tae_runner, + restore_incumbent, + random_configuration_chooser, + predict_x_best, + min_samples_model, + ) + self._callbacks.update({'_adjust_run_history': list()}) + self._callback_to_key.update({AdjustRunHistoryCallback: '_adjust_run_history'}) + + def _incorporate_run_results(self, run_info: RunInfo, result: RunValue, time_left: float) -> None: + # update SMAC stats + self.stats.ta_time_used += float(result.time) + self.stats.finished_ta_runs += 1 + + self.logger.debug( + "Return: Status: %r, cost: %f, time: %f, additional: %s" % ( + result.status, result.cost, result.time, str(result.additional_info) + ) + ) + + self.runhistory.add( + config=run_info.config, + cost=result.cost, + time=result.time, + status=result.status, + instance_id=run_info.instance, + seed=run_info.seed, + budget=run_info.budget, + starttime=result.starttime, + endtime=result.endtime, + force_update=True, + additional_info=result.additional_info, + ) + self.stats.n_configs = len(self.runhistory.config_ids) + + if result.status == StatusType.ABORT: + raise TAEAbortException("Target algorithm status ABORT - SMAC will " + "exit. The last incumbent can be found " + "in the trajectory-file.") + elif result.status == StatusType.STOP: + self._stop = True + return + + if self.scenario.abort_on_first_run_crash: # type: ignore[attr-defined] # noqa F821 + if self.stats.finished_ta_runs == 1 and result.status == StatusType.CRASHED: + raise FirstRunCrashedException( + "First run crashed, abort. Please check your setup -- we assume that your default " + "configuration does not crashes. (To deactivate this exception, use the SMAC scenario option " + "'abort_on_first_run_crash'). 
Additional run info: %s" % result.additional_info + ) + for callback in self._callbacks['_incorporate_run_results']: + response = callback(smbo=self, run_info=run_info, result=result, time_left=time_left) + # If a callback returns False, the optimization loop should be interrupted + # the other callbacks are still being called + if response is False: + self.logger.debug("An IncorporateRunResultCallback returned False, requesting abort.") + self._stop = True + + for callback in self._callbacks['_adjust_run_history']: + result = callback(smbo=self) + # Update the intensifier with the result of the runs + self.incumbent, inc_perf = self.intensifier.process_results( + run_info=run_info, + incumbent=self.incumbent, + run_history=self.runhistory, + time_bound=max(self._min_time, time_left), + result=result, + ) + + if self.scenario.save_instantly: # type: ignore[attr-defined] # noqa F821 + self.save() + + return From adca8d2e3aed868bc937270c1d8feeec1ab7df9b Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 17:39:16 +0200 Subject: [PATCH 10/16] Fixed bugs in running paper code, should be fine now (clean) --- autoPyTorch/api/base_task.py | 38 +++- autoPyTorch/api/tabular_classification.py | 4 + .../ensemble/stacking_ensemble_builder.py | 163 ++++++------------ autoPyTorch/evaluation/stacking_evaluator.py | 4 +- autoPyTorch/optimizer/run_history_callback.py | 9 +- autoPyTorch/optimizer/smbo.py | 16 +- autoPyTorch/optimizer/utils.py | 13 +- .../20_basics/example_stacking_ensemble.py | 13 +- 8 files changed, 135 insertions(+), 125 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index d4d734e6f..3366c6bad 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -27,6 +27,7 @@ import pandas as pd +from smac.optimizer.smbo import SMBO from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue from smac.stats.stats import Stats from smac.tae import StatusType @@ -53,6 +54,7 @@ from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash from autoPyTorch.evaluation.utils import DisableFileOutputParameters +from autoPyTorch.optimizer.run_history_callback import RunHistoryUpdaterManager from autoPyTorch.optimizer.smbo import AutoMLSMBO from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import get_available_traditional_learners @@ -974,7 +976,8 @@ def _search( disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, - dask_client: Optional[dask.distributed.Client] = None + dask_client: Optional[dask.distributed.Client] = None, + smbo_class: Optional[SMBO] = None ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. 
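To make the new hook's data flow easier to follow: `autoPyTorchSMBO` keeps an extra `_adjust_run_history` callback list, and after each evaluated configuration the registered callbacks return `(run_key, cost)` pairs that overwrite costs already stored in SMAC's run history (the final form of this contract appears in a later commit of this series, where the callback takes no arguments and returns the pairs). The following is a minimal, self-contained sketch of that pattern only; `MiniSMBO`, `EnsembleLossCallback` and the dict-based run history are illustrative stand-ins, not autoPyTorch or SMAC APIs.

```python
from typing import Dict, List, Optional, Tuple


class AdjustRunHistoryCallback:
    """Illustrative stand-in for the callback base class added in this patch."""

    def __call__(self) -> Optional[List[Tuple[str, float]]]:
        raise NotImplementedError


class MiniSMBO:
    """Toy optimisation loop: after each run it fires the '_adjust_run_history'
    callbacks and overwrites costs in a dict-based run history with the
    (run_key, cost) pairs they return."""

    def __init__(self) -> None:
        self.runhistory: Dict[str, float] = {"run_0": 0.40, "run_1": 0.35}
        self._callbacks: Dict[str, List[AdjustRunHistoryCallback]] = {"_adjust_run_history": []}

    def register_callback(self, callback: AdjustRunHistoryCallback) -> None:
        # SMAC routes callbacks to the right list via _callback_to_key; a type check is enough here
        if isinstance(callback, AdjustRunHistoryCallback):
            self._callbacks["_adjust_run_history"].append(callback)

    def incorporate_run_result(self) -> None:
        for callback in self._callbacks["_adjust_run_history"]:
            response = callback()
            if response is None:
                continue
            for run_key, cost in response:
                # only update runs that already exist, mirroring the guard in the patch above
                if run_key in self.runhistory:
                    self.runhistory[run_key] = cost


class EnsembleLossCallback(AdjustRunHistoryCallback):
    """Pretends the ensemble builder found run_1 far more useful inside the stack."""

    def __call__(self) -> Optional[List[Tuple[str, float]]]:
        return [("run_1", 0.10)]


if __name__ == "__main__":
    opt = MiniSMBO()
    opt.register_callback(EnsembleLossCallback())
    opt.incorporate_run_result()
    print(opt.runhistory)  # {'run_0': 0.4, 'run_1': 0.1}
```

The real implementation replaces the stored `RunValue` object rather than a bare float, but the control flow is the same.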
@@ -1215,7 +1218,7 @@ def _search( # ============> Run dummy predictions # We only want to run dummy predictions in case we want to build an ensemble - if self.ensemble_size > 0: + if self.ensemble_size > 0 and self.ensemble_method != EnsembleSelectionTypes.stacking_ensemble: dummy_task_name = 'runDummy' self._stopwatch.start_task(dummy_task_name) self._do_dummy_prediction() @@ -1248,7 +1251,6 @@ def _search( else: self._logger.info("Starting ensemble") ensemble_task_name = 'ensemble' - self._stopwatch.start_task(ensemble_task_name) proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles, ensemble_size=self.ensemble_size, ensemble_nbest=self.ensemble_nbest, @@ -1256,7 +1258,12 @@ def _search( optimize_metric=self.opt_metric, ensemble_method=self.ensemble_method ) - self._stopwatch.stop_task(ensemble_task_name) + proc_runhistory_updater = None + if ( + self.ensemble_method == EnsembleSelectionTypes.stacking_ensemble + and smbo_class is not None + ): + proc_runhistory_updater = self._init_result_history_updater() # ==> Run SMAC smac_task_name: str = 'runSMAC' @@ -1299,6 +1306,8 @@ def _search( search_space_updates=self.search_space_updates, portfolio_selection=portfolio_selection, pynisher_context=self._multiprocessing_context, + smbo_class = smbo_class, + other_callbacks=[proc_runhistory_updater] if proc_runhistory_updater is not None else None ) try: run_history, self._results_manager.trajectory, budget_type = \ @@ -1934,6 +1943,27 @@ def _collect_results_ensemble( pd.DataFrame(self.ensemble_performance_history).to_json( os.path.join(self._backend.internals_directory, 'ensemble_history.json')) + def _init_result_history_updater(self): + if self.dataset is None: + raise ValueError("runhistory updater can only be initialised after or during `search()`. " + "Please call the `search()` method of {}.".format(self.__class__.__name__)) + + self._logger.info("Starting Runhistory updater") + runhistory_task_name = 'runhistory_updater' + self._stopwatch.start_task(runhistory_task_name) + + proc_runhistory_updater = RunHistoryUpdaterManager( + backend=self._backend, + dataset_name=self.dataset_name, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + logger_port=self._logger_port + ) + + self._stopwatch.stop_task(runhistory_task_name) + + return proc_runhistory_updater + def predict( self, X_test: np.ndarray, diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index da1cf293b..5641c1005 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -4,6 +4,8 @@ import pandas as pd +from smac.optimizer.smbo import SMBO + from autoPyTorch.api.base_task import BaseTask from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import ( @@ -256,6 +258,7 @@ def search( load_models: bool = True, portfolio_selection: Optional[str] = None, dataset_compression: Union[Mapping[str, Any], bool] = False, + smbo_class: Optional[SMBO] = None ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. 
@@ -455,6 +458,7 @@ def search( disable_file_output=disable_file_output, load_models=load_models, portfolio_selection=portfolio_selection, + smbo_class=smbo_class ) def predict( diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index ad6136e26..9b324af8b 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -117,7 +117,7 @@ def __init__( # we can do this by either storing and reading them in this class # or passing them via the ensemble builder manager which has persistency with the futures stored. self.ensemble_identifiers: Optional[List[Optional[str]]] = None - + self.read_losses = {} # TODO: This is the main wrapper to the EnsembleSelection class which fits # TODO: the ensemble @@ -207,18 +207,9 @@ def main( # populates test predictions in self.read_preds # reduces selected models if file reading failed - n_sel_test = self.get_test_preds(selected_keys=candidate_models) - - # If any of n_sel_* is not empty and overlaps with candidate_models, - # then ensure candidate_models AND n_sel_test are sorted the same - candidate_models_set = set(candidate_models) - if candidate_models_set.intersection(n_sel_test): - candidate_models = sorted(list(candidate_models_set.intersection( - n_sel_test))) - n_sel_test = candidate_models - else: - # This has to be the case - n_sel_test = [] + candidate_models = self.get_test_preds(selected_keys=candidate_models) + + self.logger.debug(f"n_sel_test: {candidate_models}") if os.environ.get('ENSEMBLE_KEEP_ALL_CANDIDATES'): for candidate in candidate_models: @@ -227,6 +218,8 @@ def main( # as candidate models is sorted in `get_n_best_preds` best_model_identifier = candidate_models[0] + self.logger.debug(f"for iteration {iteration}, best_model_identifier: {best_model_identifier} \n candidate_models: \n{candidate_models}") + # train ensemble ensemble = self.fit_ensemble( best_model_identifier=best_model_identifier @@ -334,6 +327,7 @@ def compute_ensemble_loss_per_model(self) -> bool: # Mypy assumes sorted returns an object because of the lambda. 
Can't get to recognize the list # as a returning list, so as a work-around we skip next line for y_ens_fn, match, _seed, _num_run, _budget in sorted(to_read, key=lambda x: x[3]): # type: ignore + self.logger.debug(f"This is for model {y_ens_fn}") if self.read_at_most and n_read_files >= self.read_at_most: # limit the number of files that will be read # to limit memory consumption @@ -343,22 +337,21 @@ def compute_ensemble_loss_per_model(self) -> bool: self.logger.info('Error loading file (not .npy or .npy.gz): %s', y_ens_fn) continue - if not self.read_losses.get(y_ens_fn): - self.read_losses[y_ens_fn] = { - "ens_loss": np.inf, - "mtime_ens": 0, - "mtime_test": 0, - "seed": _seed, - "num_run": _num_run, - "budget": _budget, - "disc_space_cost_mb": None, - # Lazy keys so far: - # 0 - not loaded - # 1 - loaded and in memory - # 2 - loaded but dropped again - # 3 - deleted from disk due to space constraints - "loaded": 0 - } + self.read_losses[y_ens_fn] = { + "ens_loss": np.inf, + "mtime_ens": 0, + "mtime_test": 0, + "seed": _seed, + "num_run": _num_run, + "budget": _budget, + "disc_space_cost_mb": None, + # Lazy keys so far: + # 0 - not loaded + # 1 - loaded and in memory + # 2 - loaded but dropped again + # 3 - deleted from disk due to space constraints + "loaded": 0 + } if not self.read_preds.get(y_ens_fn): self.read_preds[y_ens_fn] = { Y_ENSEMBLE: None, @@ -371,20 +364,12 @@ def compute_ensemble_loss_per_model(self) -> bool: # actually read the predictions and compute their respective loss try: + ensemble_idenitfiers = self.ensemble_identifiers.copy() + ensemble_idenitfiers[self.ensemble_slot_j] = y_ens_fn y_ensemble = self._read_np_fn(y_ens_fn) losses = self.get_ensemble_loss_with_model( - model_predictions=y_ensemble - ) - - if np.isfinite(self.read_losses[y_ens_fn]["ens_loss"]): - self.logger.debug( - 'Changing ensemble loss for file %s from %f to %f ' - 'because file modification time changed? 
%f - %f', - y_ens_fn, - self.read_losses[y_ens_fn]["ens_loss"], - losses[self.opt_metric], - self.read_losses[y_ens_fn]["mtime_ens"], - os.path.getmtime(y_ens_fn), + model_predictions=y_ensemble, + ensemble_identifiers=ensemble_idenitfiers ) self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] @@ -470,11 +455,11 @@ def fit_ensemble( ) try: - self.logger.debug( - "Fitting the ensemble on %d models.", - len(predictions_train), - ) - self.logger.debug(f"predictions sent to ensemble: {predictions_train}") + # self.logger.debug( + # "Fitting the ensemble on %d models.", + # len(predictions_train), + # ) + # self.logger.debug(f"predictions sent to ensemble: {predictions_train}") start_time = time.time() ensemble.fit( predictions_train, @@ -489,7 +474,7 @@ def fit_ensemble( "Fitting the ensemble took %.2f seconds.", end_time - start_time, ) - self.logger.debug(f"weights = {ensemble.weights_}") + # self.logger.debug(f"weights = {ensemble.weights_}") self.logger.info(str(ensemble)) self.validation_performance_ = min( self.validation_performance_, @@ -539,9 +524,7 @@ def predict(self, set_: str, else: pred_set = Y_ENSEMBLE - self.logger.debug(f"selected_keys with {set_} for predict are {selected_keys}") predictions = [self.read_preds[k][pred_set] if k is not None else None for k in selected_keys] - self.logger.debug(f"predictions with {set_} for predict are {len(predictions)}") if n_preds == len(predictions): y = ensemble.predict(predictions) @@ -582,35 +565,6 @@ def get_candidate_preds(self) -> List[str]: sorted_keys = self._get_list_of_sorted_preds() - # number of models available - num_keys = len(sorted_keys) - # remove all that are at most as good as random - # note: dummy model must have run_id=1 (there is no run_id=0) - dummy_losses = list(filter(lambda x: x[2] == 1, sorted_keys)) - # Leave this here for when we enable dummy classifier/scorer - if len(dummy_losses) > 0: - # number of dummy models - num_dummy = len(dummy_losses) - dummy_loss = dummy_losses[0] - self.logger.debug("Use %f as dummy loss" % dummy_loss[1]) - sorted_keys = list(filter(lambda x: x[1] < dummy_loss[1], sorted_keys)) - - # remove Dummy Classifier - sorted_keys = list(filter(lambda x: x[2] > 1, sorted_keys)) - if len(sorted_keys) == 0: - # no model left; try to use dummy loss (num_run==0) - # log warning when there are other models but not better than dummy model - if num_keys > num_dummy: - self.logger.warning("No models better than random - using Dummy Score!" - "Number of models besides current dummy model: %d. 
" - "Number of dummy models: %d", - num_keys - 1, - num_dummy) - sorted_keys = [ - (k, v["ens_loss"], v["num_run"]) for k, v in self.read_losses.items() - if v["seed"] == self.seed and v["num_run"] == 1 - ] - # reduce to keys reduced_sorted_keys = list(map(lambda x: x[0], sorted_keys)) @@ -629,7 +583,10 @@ def get_candidate_preds(self) -> List[str]: # return best scored keys of self.read_losses return reduced_sorted_keys - def get_ensemble_loss_with_model(self, model_predictions: np.ndarray): + def get_ensemble_loss_with_model(self, + model_predictions: np.ndarray, + ensemble_identifiers: List[str] + ): """ Gets the loss of the ensemble given slot j and predictions for new model at slot j set is ensemble @@ -637,41 +594,33 @@ def get_ensemble_loss_with_model(self, model_predictions: np.ndarray): model_predictions ([type]): [description] """ - weighted_ensemble_prediction = np.zeros( - model_predictions.shape, - dtype=np.float64, - ) - fant_ensemble_prediction = np.zeros( - weighted_ensemble_prediction.shape, - dtype=np.float64, - ) + # self.logger.debug(f"in ensemble_loss predictions for current are \n{model_predictions}") + self.logger.debug(f"in ensemble_loss ensemble_identifiers: {ensemble_identifiers}") + + average_predictions = np.zeros_like(model_predictions, dtype=np.float64) + tmp_predictions = np.empty_like(model_predictions, dtype=np.float64) + nonnull_identifiers = len([identifier for identifier in ensemble_identifiers if identifier is not None]) - for i, identifier in enumerate(self.ensemble_identifiers): - if identifier is None: - if i == self.ensemble_slot_j: + self.logger.debug(f"non null identifiers : {nonnull_identifiers}") + weight = 1. / float(nonnull_identifiers) + # if prediction model.shape[0] == len(non_null_weights), + # predictions do not include those of zero-weight models. + for identifier in ensemble_identifiers: + if identifier is not None: + if self.read_preds[identifier][Y_ENSEMBLE] is None: predictions = model_predictions else: - continue + predictions = self.read_preds[identifier][Y_ENSEMBLE] else: - if self.read_preds[identifier][Y_ENSEMBLE] is None: - # y ensemble read_preds is loaded in get_n_best_preds. If there is no value for this that means its a new model at this iteration. - raise ValueError(f"check here to resolve starting condition, {self.read_preds[identifier]}") - predictions = self.read_preds[identifier][Y_ENSEMBLE] if i != self.ensemble_slot_j else model_predictions - - np.add( - weighted_ensemble_prediction, - predictions, - out=fant_ensemble_prediction - ) - np.multiply( - fant_ensemble_prediction, - (1. 
/ float(self.ensemble_size)), - out=fant_ensemble_prediction - ) + break + + np.multiply(predictions, weight, out=tmp_predictions) + np.add(average_predictions, tmp_predictions, out=average_predictions) + loss = calculate_loss( metrics=self.metrics, target=self.y_true_ensemble, - prediction=fant_ensemble_prediction, + prediction=average_predictions, task_type=self.task_type, ) return loss diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index 11401fe2b..d01c846b0 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -11,6 +11,7 @@ from smac.tae import StatusType +from autoPyTorch import ensemble from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import ( CLASSIFICATION_TASKS, @@ -160,7 +161,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], if file_output: loss_, additional_run_info_ = self.file_output( - ensemble_opt_pred, valid_pred, test_pred + pipeline_opt_pred, valid_pred, test_pred ) else: loss_ = None @@ -382,6 +383,7 @@ def _predict(self, pipeline: BaseEstimator, else: ensemble_opt_pred = pipeline_opt_pred.copy() + self.logger.debug(f"for model {self.seed}_{self.num_run}_{self.budget} ensemble_predictions are {ensemble_opt_pred}") if self.X_valid is not None: valid_pred = self.predict_function(self.X_valid, pipeline, self.y_valid) diff --git a/autoPyTorch/optimizer/run_history_callback.py b/autoPyTorch/optimizer/run_history_callback.py index 02d0616ba..6b020e174 100644 --- a/autoPyTorch/optimizer/run_history_callback.py +++ b/autoPyTorch/optimizer/run_history_callback.py @@ -28,7 +28,6 @@ class RunHistoryUpdaterManager(AdjustRunHistoryCallback): def __init__( self, backend: Backend, - random_state: int, dataset_name: str, resampling_strategy: Union[ HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes @@ -52,7 +51,6 @@ def __init__( self.backend = backend - self.random_state = random_state self.logger_port = logger_port # We only submit new ensembles when there is not an active ensemble job @@ -91,11 +89,11 @@ def adjust_run_history( if self.futures[0].done(): result = self.futures.pop().result() if result: - ensemble_history, self.ensemble_nbest, _, _ = result - logger.debug("iteration={} @ elapsed_time={} has history={}".format( + response = result + logger.debug("iteration={} @ elapsed_time={} has response={}".format( self.iteration, elapsed_time, - ensemble_history, + response, )) # Only submit new jobs if the previous ensemble job finished @@ -267,6 +265,7 @@ def __init__( self.instances = [[json.dumps({'task_id': dataset_name})]] def run(self, iteration: int) -> Optional[List[Tuple[RunKey, float]]]: + self.logger.info(f"Starting iteration {iteration} of run history updater") results: List[Tuple[RunInfo, float]] = [] if os.path.exists(self.ensemble_loss_file): try: diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 945ff880d..1238e608d 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -10,6 +10,7 @@ from smac.facade.smac_ac_facade import SMAC4AC from smac.intensification.hyperband import Hyperband +from smac.optimizer.smbo import SMBO from smac.runhistory.runhistory import RunHistory from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost from smac.scenario.scenario import Scenario @@ -43,6 +44,7 @@ def get_smac_object( initial_budget: int, max_budget: int, dask_client: Optional[dask.distributed.Client], + smbo_class: 
Optional[SMBO] = None, initial_configurations: Optional[List[Configuration]] = None, ) -> SMAC4AC: """ @@ -80,6 +82,7 @@ def get_smac_object( 'eta': 3, 'min_chall': 1, 'instance_order': 'shuffle_once'}, dask_client=dask_client, n_jobs=n_jobs, + smbo_class=smbo_class ) @@ -116,7 +119,9 @@ def __init__(self, pynisher_context: str = 'spawn', min_budget: int = 5, max_budget: int = 50, - ensemble_method: int = EnsembleSelectionTypes.ensemble_selection + ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, + other_callbacks: Optional[List] = None, + smbo_class: Optional[SMBO] = None ): """ Interface to SMAC. This method calls the SMAC optimize method, and allows @@ -234,6 +239,9 @@ def __init__(self, self.ensemble_callback = ensemble_callback + self.other_callbacks = other_callbacks + self.smbo_class = smbo_class + self.search_space_updates = search_space_updates if logger_port is None: @@ -362,11 +370,15 @@ def run_smbo(self, func: Optional[Callable] = None initial_budget=self.min_budget, max_budget=self.max_budget, dask_client=self.dask_client, - initial_configurations=self.initial_configurations) + initial_configurations=self.initial_configurations, + smbo_class=self.smbo_class) if self.ensemble_callback is not None: smac.register_callback(self.ensemble_callback) + if self.other_callbacks is not None: + for callback in self.other_callbacks: + smac.register_callback(callback) self.logger.info("initialised SMBO, running SMBO.optimize()") smac.optimize() diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index 37c6795fc..b15cf2580 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -20,6 +20,7 @@ from smac.tae.base import BaseRunner from smac.optimizer.random_configuration_chooser import RandomConfigurationChooser, ChooserNoCoolDown +from autoPyTorch.utils.common import dict_repr def read_return_initial_configurations( config_space: ConfigurationSpace, @@ -134,6 +135,7 @@ def _incorporate_run_results(self, run_info: RunInfo, result: RunValue, time_lef "configuration does not crashes. (To deactivate this exception, use the SMAC scenario option " "'abort_on_first_run_crash'). 
Additional run info: %s" % result.additional_info ) + self.logger.debug(f"\nbefore ensemble, result: {result}, \nrunhistory: {self.runhistory.data}") for callback in self._callbacks['_incorporate_run_results']: response = callback(smbo=self, run_info=run_info, result=result, time_left=time_left) # If a callback returns False, the optimization loop should be interrupted @@ -143,7 +145,16 @@ def _incorporate_run_results(self, run_info: RunInfo, result: RunValue, time_lef self._stop = True for callback in self._callbacks['_adjust_run_history']: - result = callback(smbo=self) + response = callback(smbo=self) + if response is not None: + for run_key, cost in response: + run_value = self.runhistory.data.get(run_key, None) + if run_value is not None: + run_value.cost = cost + self.epm_chooser.runhistory = self.runhistory + + self.logger.debug(f"\nafter runhistory updater, result: {result}, \nrunhistory: {dict_repr(self.runhistory.data)}") + # Update the intensifier with the result of the runs self.incumbent, inc_perf = self.intensifier.process_results( run_info=run_info, diff --git a/examples/20_basics/example_stacking_ensemble.py b/examples/20_basics/example_stacking_ensemble.py index 4ceefda8d..e3d7c308a 100644 --- a/examples/20_basics/example_stacking_ensemble.py +++ b/examples/20_basics/example_stacking_ensemble.py @@ -22,14 +22,15 @@ from autoPyTorch.api.tabular_classification import TabularClassificationTask from autoPyTorch.ensemble.utils import EnsembleSelectionTypes +from autoPyTorch.optimizer.utils import autoPyTorchSMBO ############################################################################ # Data Loading # ============ X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X[:200], - y[:200], + X, + y, random_state=1, ) @@ -57,10 +58,12 @@ X_test=X_test.copy(), y_test=y_test.copy(), dataset_name='Australian', - optimize_metric='zero_one_loss', - total_walltime_limit=300, + optimize_metric='accuracy', + total_walltime_limit=1000, func_eval_time_limit_secs=50, - enable_traditional_pipeline=False + enable_traditional_pipeline=False, + smbo_class=autoPyTorchSMBO, + all_supported_metrics=False ) ############################################################################ From 4f8289c678eace59b7c00ca2f5dbb2fe4d9c57dd Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 17:52:07 +0200 Subject: [PATCH 11/16] removed finished TODO comments and fix run_history_updater (clean) --- autoPyTorch/api/base_task.py | 13 ++- autoPyTorch/ensemble/stacking_ensemble.py | 17 +--- .../ensemble/stacking_ensemble_builder.py | 21 ++--- autoPyTorch/evaluation/stacking_evaluator.py | 11 +-- autoPyTorch/optimizer/run_history_callback.py | 91 +++++++------------ autoPyTorch/optimizer/utils.py | 15 ++- 6 files changed, 63 insertions(+), 105 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 3366c6bad..b4ba01c99 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1250,7 +1250,6 @@ def _search( self._logger.info("Not starting ensemble builder as ensemble size is 0") else: self._logger.info("Starting ensemble") - ensemble_task_name = 'ensemble' proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles, ensemble_size=self.ensemble_size, ensemble_nbest=self.ensemble_nbest, @@ -1258,12 +1257,15 @@ def _search( optimize_metric=self.opt_metric, ensemble_method=self.ensemble_method ) + + 
smac_initial_num_run = self._backend.get_next_num_run(peek=True) + proc_runhistory_updater = None if ( self.ensemble_method == EnsembleSelectionTypes.stacking_ensemble and smbo_class is not None ): - proc_runhistory_updater = self._init_result_history_updater() + proc_runhistory_updater = self._init_result_history_updater(initial_num_run=smac_initial_num_run) # ==> Run SMAC smac_task_name: str = 'runSMAC' @@ -1302,7 +1304,7 @@ def _search( logger_port=self._logger_port, # We do not increase the num_run here, this is something # smac does internally - start_num_run=self._backend.get_next_num_run(peek=True), + start_num_run=smac_initial_num_run, search_space_updates=self.search_space_updates, portfolio_selection=portfolio_selection, pynisher_context=self._multiprocessing_context, @@ -1943,7 +1945,7 @@ def _collect_results_ensemble( pd.DataFrame(self.ensemble_performance_history).to_json( os.path.join(self._backend.internals_directory, 'ensemble_history.json')) - def _init_result_history_updater(self): + def _init_result_history_updater(self, initial_num_run: int) -> RunHistoryUpdaterManager: if self.dataset is None: raise ValueError("runhistory updater can only be initialised after or during `search()`. " "Please call the `search()` method of {}.".format(self.__class__.__name__)) @@ -1957,7 +1959,8 @@ def _init_result_history_updater(self): dataset_name=self.dataset_name, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - logger_port=self._logger_port + logger_port=self._logger_port, + initial_num_run=initial_num_run ) self._stopwatch.stop_task(runhistory_task_name) diff --git a/autoPyTorch/ensemble/stacking_ensemble.py b/autoPyTorch/ensemble/stacking_ensemble.py index 913a3024f..a0acc9015 100644 --- a/autoPyTorch/ensemble/stacking_ensemble.py +++ b/autoPyTorch/ensemble/stacking_ensemble.py @@ -11,10 +11,6 @@ from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss -# TODO: for now we can use this and pass this to stacking evaluator. -# TODO: This can be achieved by using `backend.load_ensemble` -# TODO: it loads the last stored ensemble. So we have access to it. -# TODO: the ensemble is a pickled file containing the fitted ensemble of this class. # TODO: Think of functionality of the functions in this class adjusted for stacking. 
class StackingEnsemble(AbstractEnsemble): def __init__( @@ -153,7 +149,6 @@ def _calculate_weights(self) -> None: self.weights_ = weights - # TODO: Adjust this to use weights and make def predict(self, predictions: List[np.ndarray]) -> np.ndarray: return self._predict(predictions, self.weights_) @@ -191,15 +186,9 @@ def _predict(self, predictions, weights): del tmp_predictions return average - # def __str__(self) -> str: - # return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \ - # '\n\tWeights: %s\n\tIdentifiers: %s' % \ - # (' '.join(['%d: %5f' % (idx, performance) - # for idx, performance in enumerate(self.trajectory_)]), - # self.indices_, self.weights_, - # ' '.join([str(identifier) for idx, identifier in - # enumerate(self.identifiers_) - # if self.weights_[idx] > 0])) + def __str__(self) -> str: + return f"Ensemble Selection:\n\tWeights: {self.weights_}\ + \n\tIdentifiers: {' '.join([str(identifier) for idx, identifier in enumerate(self.identifiers_) if self.weights_[idx] > 0])}" def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: """ diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index 9b324af8b..4aa96440b 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -27,7 +27,6 @@ MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' -# TODO: think of what functions are needed to support stacking # TODO: make functions to support stacking. class StackingEnsembleBuilder(EnsembleBuilder): def __init__( @@ -119,8 +118,7 @@ def __init__( self.ensemble_identifiers: Optional[List[Optional[str]]] = None self.read_losses = {} - # TODO: This is the main wrapper to the EnsembleSelection class which fits - # TODO: the ensemble + # This is the main wrapper to the EnsembleSelection class which fits the ensemble def main( self, time_left: float, iteration: int, return_predictions: bool, ) -> Tuple[ @@ -209,7 +207,7 @@ def main( # reduces selected models if file reading failed candidate_models = self.get_test_preds(selected_keys=candidate_models) - self.logger.debug(f"n_sel_test: {candidate_models}") + # self.logger.debug(f"n_sel_test: {candidate_models}") if os.environ.get('ENSEMBLE_KEEP_ALL_CANDIDATES'): for candidate in candidate_models: @@ -218,7 +216,7 @@ def main( # as candidate models is sorted in `get_n_best_preds` best_model_identifier = candidate_models[0] - self.logger.debug(f"for iteration {iteration}, best_model_identifier: {best_model_identifier} \n candidate_models: \n{candidate_models}") + # self.logger.debug(f"for iteration {iteration}, best_model_identifier: {best_model_identifier} \n candidate_models: \n{candidate_models}") # train ensemble ensemble = self.fit_ensemble( @@ -229,7 +227,7 @@ def main( if ensemble is not None and self.SAVE2DISC: self.backend.save_ensemble(ensemble, iteration, self.seed) ensemble_identifiers=self._get_identifiers_from_num_runs(ensemble.identifiers_) - self.logger.debug(f"ensemble_identifiers being saved are {ensemble_identifiers}") + # self.logger.debug(f"ensemble_identifiers being saved are {ensemble_identifiers}") self._save_ensemble_identifiers( ensemble_identifiers=ensemble_identifiers ) @@ -272,10 +270,6 @@ def main( else: return self.ensemble_history, self.ensemble_nbest, None, None - # TODO: change this function, to compute loss according to Lavesque et al. - # TODO: this will help us in choosing the model with the lowest ensemble error. 
- # TODO: predictions on ensemble set will be available in read_preds to be used for - # TODO: passing to stacking_ensemble_builder.predict() def compute_ensemble_loss_per_model(self) -> bool: """ Compute the loss of the predictions on ensemble building data set; @@ -327,7 +321,6 @@ def compute_ensemble_loss_per_model(self) -> bool: # Mypy assumes sorted returns an object because of the lambda. Can't get to recognize the list # as a returning list, so as a work-around we skip next line for y_ens_fn, match, _seed, _num_run, _budget in sorted(to_read, key=lambda x: x[3]): # type: ignore - self.logger.debug(f"This is for model {y_ens_fn}") if self.read_at_most and n_read_files >= self.read_at_most: # limit the number of files that will be read # to limit memory consumption @@ -595,13 +588,13 @@ def get_ensemble_loss_with_model(self, """ # self.logger.debug(f"in ensemble_loss predictions for current are \n{model_predictions}") - self.logger.debug(f"in ensemble_loss ensemble_identifiers: {ensemble_identifiers}") + # self.logger.debug(f"in ensemble_loss ensemble_identifiers: {ensemble_identifiers}") average_predictions = np.zeros_like(model_predictions, dtype=np.float64) tmp_predictions = np.empty_like(model_predictions, dtype=np.float64) nonnull_identifiers = len([identifier for identifier in ensemble_identifiers if identifier is not None]) - self.logger.debug(f"non null identifiers : {nonnull_identifiers}") + # self.logger.debug(f"non null identifiers : {nonnull_identifiers}") weight = 1. / float(nonnull_identifiers) # if prediction model.shape[0] == len(non_null_weights), # predictions do not include those of zero-weight models. @@ -626,7 +619,7 @@ def get_ensemble_loss_with_model(self, return loss def _get_ensemble_identifiers_filename(self): - return os.path.join(self.backend.temporary_directory, 'ensemble_identifiers.pkl') + return os.path.join(self.backend.internals_directory, 'ensemble_identifiers.pkl') def _save_ensemble_identifiers(self, ensemble_identifiers: List[Optional[str]]) -> None: with open(self._get_ensemble_identifiers_filename(), "wb") as file: diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index d01c846b0..15efac5b9 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -137,10 +137,6 @@ def __init__(self, backend: Backend, queue: Queue, search_space_updates=search_space_updates ) - # TODO: we cant store the ensemble pipelines with this class as it is initialised for every TAE (target algorithm evaluation). - # TODO: Therefore we will have to store pipelines using datamanager and load them, see if we only need predictions. - # TODO: but we will need the whole pipeline as we would like to predict with different dataset, like val or something - self.logger.debug("Search space updates :{}".format(self.search_space_updates)) def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], @@ -364,11 +360,6 @@ def _predict(self, pipeline: BaseEstimator, test_indices: Union[np.ndarray, List], train_indices: Union[np.ndarray, List] ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: - # TODO: load ensemble members and predict using the whole ensemble. - # TODO: we need some function to pass this pipeline to the last stored ensemble replace - # TODO: model j, where j = ensemble.iteration mod m. 
then we need to predict - # TODO: Also, we will pass the predictions from this pipeline as that is what is needed - # TODO: to create the ensemble. train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, self.y_train[train_indices]) @@ -383,7 +374,7 @@ def _predict(self, pipeline: BaseEstimator, else: ensemble_opt_pred = pipeline_opt_pred.copy() - self.logger.debug(f"for model {self.seed}_{self.num_run}_{self.budget} ensemble_predictions are {ensemble_opt_pred}") + # self.logger.debug(f"for model {self.seed}_{self.num_run}_{self.budget} ensemble_predictions are {ensemble_opt_pred}") if self.X_valid is not None: valid_pred = self.predict_function(self.X_valid, pipeline, self.y_valid) diff --git a/autoPyTorch/optimizer/run_history_callback.py b/autoPyTorch/optimizer/run_history_callback.py index 6b020e174..376478813 100644 --- a/autoPyTorch/optimizer/run_history_callback.py +++ b/autoPyTorch/optimizer/run_history_callback.py @@ -10,7 +10,9 @@ import dask.distributed from distributed.utils import Any + from numpy.random.mtrand import seed +import numpy as np from smac.optimizer.smbo import SMBO from smac.runhistory.runhistory import RunInfo, RunKey @@ -28,6 +30,7 @@ class RunHistoryUpdaterManager(AdjustRunHistoryCallback): def __init__( self, backend: Backend, + initial_num_run: int, dataset_name: str, resampling_strategy: Union[ HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes @@ -59,6 +62,7 @@ def __init__( # The last criteria is the number of iterations self.iteration = 0 + self.initial_num_run = initial_num_run # Keep track of when we started to know when we need to finish! self.start_time = time.time() self.dataset_name = dataset_name @@ -67,16 +71,12 @@ def __init__( def __call__( self, - smbo: 'SMBO', - ) -> None: - self.adjust_run_history(smbo.tae_runner.client) + ) -> Optional[List[Tuple[RunKey, float]]]: + return self.adjust_run_history() def adjust_run_history( self, - dask_client: dask.distributed.Client, - unit_test: bool = False - ) -> None: - + ) -> Optional[List[Tuple[RunKey, float]]]: # The second criteria is elapsed time elapsed_time = time.time() - self.start_time @@ -85,65 +85,35 @@ def adjust_run_history( port=self.logger_port, ) - if len(self.futures) != 0: - if self.futures[0].done(): - result = self.futures.pop().result() - if result: - response = result - logger.debug("iteration={} @ elapsed_time={} has response={}".format( - self.iteration, - elapsed_time, - response, - )) + logger.info( + "Started Ensemble builder job at {} for iteration {}.".format( + # Log the client to make sure we + # remain connected to the scheduler + time.strftime("%Y.%m.%d-%H.%M.%S"), + self.iteration, + )) - # Only submit new jobs if the previous ensemble job finished - if len(self.futures) == 0: - - # Add the result of the run - # On the next while iteration, no references to - # ensemble builder object, so it should be garbage collected to - # save memory while waiting for resources - # Also, notice how ensemble nbest is returned, so we don't waste - # iterations testing if the deterministic predictions size can - # be fitted in memory - try: - # Submit a Dask job from this job, to properly - # see it in the dask diagnostic dashboard - # Notice that the forked ensemble_builder_process will - # wait for the below function to be done - self.futures.append( - dask_client.submit( - return_run_info_cost, + response = return_run_info_cost( backend=self.backend, dataset_name=self.dataset_name, iteration=self.iteration, 
resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, logger_port=self.logger_port, - priority=100 - ) - ) - - logger.info( - "{}/{} Started Ensemble builder job at {} for iteration {}.".format( - # Log the client to make sure we - # remain connected to the scheduler - self.futures[0], - dask_client, - time.strftime("%Y.%m.%d-%H.%M.%S"), - self.iteration, - ), + initial_num_run=self.initial_num_run ) - self.iteration += 1 - except Exception as e: - exception_traceback = traceback.format_exc() - error_message = repr(e) - logger.critical(exception_traceback) - logger.critical(error_message) + logger.debug("iteration={} @ elapsed_time={} has response={}".format( + self.iteration, + elapsed_time, + response, + )) + self.iteration += 1 + return response def return_run_info_cost( backend: Backend, + initial_num_run: int, dataset_name: str, resampling_strategy: Union[ HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes @@ -218,6 +188,7 @@ def return_run_info_cost( resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, logger_port=logger_port, + initial_num_run=initial_num_run ).run( iteration=iteration, ) @@ -228,6 +199,7 @@ class RunHistoryUpdater: def __init__( self, backend: Backend, + initial_num_run: int, dataset_name: str, resampling_strategy: Union[ HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes @@ -250,6 +222,7 @@ def __init__( """ self.model_fn_re = re.compile(MODEL_FN_RE) + self.initial_num_run = initial_num_run self.logger_port = logger_port self.logger = get_named_client_logger( name='RunHistoryUpdater', @@ -266,27 +239,27 @@ def __init__( def run(self, iteration: int) -> Optional[List[Tuple[RunKey, float]]]: self.logger.info(f"Starting iteration {iteration} of run history updater") - results: List[Tuple[RunInfo, float]] = [] + results: List[Tuple[RunKey, float]] = [] if os.path.exists(self.ensemble_loss_file): try: with (open(self.ensemble_loss_file, "rb")) as memory: read_losses = pickle.load(memory) except Exception as e: self.logger.debug(f"Could not read losses at iteration: {iteration} with exception {e}") - return + return None else: for k in read_losses.keys(): match = self.model_fn_re.search(k) - if match is None or read_losses[k]["loaded"] != 1: + if match is None or not np.isfinite(read_losses[k]["ens_loss"]): continue else: _num_run = int(match.group(2)) _budget = float(match.group(3)) run_key = RunKey( seed=0, # 0 is hardcoded for the runhistory coming from smac - config_id=_num_run, + config_id=_num_run - self.initial_num_run, budget=_budget, - instance_id=self.instances[-1] + instance_id=self.instances[-1][-1] ) results.append((run_key, read_losses[k]["ens_loss"])) return results diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index b15cf2580..c44252021 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -145,15 +145,24 @@ def _incorporate_run_results(self, run_info: RunInfo, result: RunValue, time_lef self._stop = True for callback in self._callbacks['_adjust_run_history']: - response = callback(smbo=self) + response = callback() if response is not None: for run_key, cost in response: run_value = self.runhistory.data.get(run_key, None) if run_value is not None: - run_value.cost = cost + self.logger.debug(f"updated run_key: {run_key} with cost: {cost}") + updated_run_value = RunValue( + cost, + run_value.time, + run_value.status, + run_value.starttime, + run_value.endtime, + run_value.additional_info + ) + 
self.runhistory.data[run_key] = updated_run_value self.epm_chooser.runhistory = self.runhistory - self.logger.debug(f"\nafter runhistory updater, result: {result}, \nrunhistory: {dict_repr(self.runhistory.data)}") + # self.logger.debug(f"\nafter runhistory updater, result: {result}, \nrunhistory: {dict_repr(self.runhistory.data)}") # Update the intensifier with the result of the runs self.incumbent, inc_perf = self.intensifier.process_results( From 8a7e897fbf061eb1677ff69bfee6b8d2e1ab00d5 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 18:01:14 +0200 Subject: [PATCH 12/16] add possibility of normalised margin loss (clean) --- autoPyTorch/ensemble/stacking_ensemble.py | 18 ++++++++---- .../ensemble/stacking_ensemble_builder.py | 23 +++++++++++++-- autoPyTorch/evaluation/stacking_evaluator.py | 28 +++++++++++-------- 3 files changed, 51 insertions(+), 18 deletions(-) diff --git a/autoPyTorch/ensemble/stacking_ensemble.py b/autoPyTorch/ensemble/stacking_ensemble.py index a0acc9015..40ca5bc98 100644 --- a/autoPyTorch/ensemble/stacking_ensemble.py +++ b/autoPyTorch/ensemble/stacking_ensemble.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import Any, Dict, List, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import warnings import numpy as np @@ -74,7 +74,7 @@ def fit( # TODO: fit a stacked ensemble. def _fit( self, - predictions: List[np.ndarray], + predictions: List[Optional[np.ndarray]], labels: np.ndarray, ) -> None: """ @@ -125,7 +125,7 @@ def _fit( )[self.metric.name] # store list of preds for later use - self.ensemble_predictions = predictions + self.ensemble_predictions_ = predictions self.train_loss_: float = loss @@ -221,17 +221,25 @@ def predict_with_current_pipeline( where m is ensemble_size. 
returns ensemble predictions """ - predictions = self.ensemble_predictions.copy() + predictions = self.ensemble_predictions_.copy() if predictions[self.ensemble_slot_j] is None: total_predictions = len([pred for pred in predictions if pred is not None]) total_predictions += 1 - weights = [1/total_predictions for pred in predictions if pred is not None] + weights: np.ndarray = np.ndarray([1/total_predictions if pred is not None else 0 for pred in predictions]) else: weights = self.weights_ predictions[self.ensemble_slot_j] = pipeline_predictions return self._predict(predictions, weights) + def get_ensemble_predictions_with_current_pipeline( + self, + pipeline_predictions: np.ndarray + ) -> List[Optional[np.ndarray]]: + predictions = self.ensemble_predictions_.copy() + predictions[self.ensemble_slot_j] = pipeline_predictions + return predictions + def get_models_with_weights( self, models: Dict[Any, BasePipeline] diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index 4aa96440b..17689657d 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -20,6 +20,8 @@ from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss, calculate_score from autoPyTorch.utils.logging_ import get_named_client_logger +from autoPyTorch.metrics import zero_one_loss + Y_ENSEMBLE = 0 Y_TEST = 1 @@ -27,6 +29,19 @@ MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' +def calculate_nomalised_margin_loss(ensemble_predictions, y_true) -> float: + nonnull_preds = 0 + margin: float = 0 + for pred in ensemble_predictions: + if pred is not None: + nonnull_preds += 1 + margin += (1 - 2*zero_one_loss(y_true, pred)) + + margin /= nonnull_preds + + return pow((1-margin), 2)/4 + + # TODO: make functions to support stacking. class StackingEnsembleBuilder(EnsembleBuilder): def __init__( @@ -185,8 +200,9 @@ def main( ) self.ensemble_slot_j = np.mod(iteration, self.ensemble_size) - self.logger.debug(f"Iteration for ensemble building:{iteration}") self.ensemble_identifiers = self._load_ensemble_identifiers() + self.logger.debug(f"Iteration for ensemble building:{iteration}, " + f"current model to be updated: {self.ensemble_identifiers[self.ensemble_slot_j]} at slot : {self.ensemble_slot_j}") # populates self.read_preds and self.read_losses with individual model predictions and ensemble loss. if not self.compute_ensemble_loss_per_model(): if return_predictions: @@ -366,7 +382,7 @@ def compute_ensemble_loss_per_model(self) -> bool: ) self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] - + # self.read_losses[y_ens_fn]["ens_loss"] = losses["ensemble_opt_loss"] # It is not needed to create the object here # To save memory, we just compute the loss. self.read_losses[y_ens_fn]["mtime_ens"] = os.path.getmtime(y_ens_fn) @@ -598,6 +614,7 @@ def get_ensemble_loss_with_model(self, weight = 1. / float(nonnull_identifiers) # if prediction model.shape[0] == len(non_null_weights), # predictions do not include those of zero-weight models. 
+ ensemble_predictions = list() for identifier in ensemble_identifiers: if identifier is not None: if self.read_preds[identifier][Y_ENSEMBLE] is None: @@ -607,6 +624,7 @@ def get_ensemble_loss_with_model(self, else: break + ensemble_predictions.append(predictions) np.multiply(predictions, weight, out=tmp_predictions) np.add(average_predictions, tmp_predictions, out=average_predictions) @@ -616,6 +634,7 @@ def get_ensemble_loss_with_model(self, prediction=average_predictions, task_type=self.task_type, ) + # loss["ensemble_opt_loss"] = calculate_nomalised_margin_loss(ensemble_predictions, self.y_true_ensemble) return loss def _get_ensemble_identifiers_filename(self): diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index 15efac5b9..e3d77534e 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -11,13 +11,12 @@ from smac.tae import StatusType -from autoPyTorch import ensemble from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import ( CLASSIFICATION_TASKS, MULTICLASSMULTIOUTPUT, ) -from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes +from autoPyTorch.datasets.ensemble.stacking_ensemble_builder import calculate_nomalised_margin_loss from autoPyTorch.evaluation.abstract_evaluator import ( AbstractEvaluator, fit_and_suppress_warnings @@ -175,6 +174,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], return self.duration, loss_, self.seed, additional_run_info_ cost = loss[self.metric.name] + # cost = loss["ensemble_opt_loss"] additional_run_info = ( {} if additional_run_info is None else additional_run_info @@ -297,7 +297,8 @@ def fit_predict_and_loss(self) -> None: y_pipeline_opt_pred, y_ensemble_opt_pred, y_valid_pred, - y_test_pred + y_test_pred, + y_ensemble_preds ) = self._fit_and_predict(pipeline, split_id, train_indices=train_split, test_indices=test_split) @@ -305,6 +306,7 @@ def fit_predict_and_loss(self) -> None: train_loss = self._loss(self.y_train[train_split], y_train_pred) loss = self._loss(self.y_train[test_split], y_ensemble_opt_pred) + # loss['ensemble_opt_loss'] = calculate_nomalised_margin_loss(y_ensemble_preds, self.y_train[test_split]) additional_run_info = pipeline.get_additional_run_info() if hasattr( pipeline, 'get_additional_run_info') else {} @@ -334,7 +336,7 @@ def _fit_and_predict( fold: int, train_indices: Union[np.ndarray, List], test_indices: Union[np.ndarray, List], - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: # See autoPyTorch/pipeline/components/base_component.py::autoPyTorchComponent for more details # about fit_dictionary @@ -346,7 +348,7 @@ def _fit_and_predict( y = None fit_and_suppress_warnings(self.logger, pipeline, X, y) self.logger.info("Model fitted, now predicting") - Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred = self._predict( + Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred, Y_ensemble_preds = self._predict( pipeline, train_indices=train_indices, test_indices=test_indices, @@ -354,12 +356,14 @@ def _fit_and_predict( self.pipeline = pipeline - return Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred + return Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, 
Y_test_pred, Y_ensemble_preds - def _predict(self, pipeline: BaseEstimator, - test_indices: Union[np.ndarray, List], - train_indices: Union[np.ndarray, List] - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: + def _predict( + self, + pipeline: BaseEstimator, + test_indices: Union[np.ndarray, List], + train_indices: Union[np.ndarray, List] + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, self.y_train[train_indices]) @@ -371,8 +375,10 @@ def _predict(self, pipeline: BaseEstimator, old_ensemble = self.backend.load_ensemble(self.seed) assert isinstance(old_ensemble, StackingEnsemble) ensemble_opt_pred = old_ensemble.predict_with_current_pipeline(pipeline_opt_pred) + ensemble_preds = old_ensemble.get_ensemble_predictions_with_current_pipeline(pipeline_opt_pred) else: ensemble_opt_pred = pipeline_opt_pred.copy() + ensemble_preds = [pipeline_opt_pred] # self.logger.debug(f"for model {self.seed}_{self.num_run}_{self.budget} ensemble_predictions are {ensemble_opt_pred}") if self.X_valid is not None: @@ -387,7 +393,7 @@ def _predict(self, pipeline: BaseEstimator, else: test_pred = None - return train_pred, pipeline_opt_pred, ensemble_opt_pred, valid_pred, test_pred + return train_pred, pipeline_opt_pred, ensemble_opt_pred, valid_pred, test_pred, ensemble_preds # create closure for evaluating an algorithm From 26ff2a404285f5f7912eb0a1d14d9c6dd86f2b47 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 18:13:42 +0200 Subject: [PATCH 13/16] add option for use ensemble loss and minor fixes --- autoPyTorch/api/base_task.py | 15 ++++++++---- autoPyTorch/api/tabular_classification.py | 6 +++-- .../ensemble/ensemble_builder_manager.py | 8 +++++-- .../ensemble/stacking_ensemble_builder.py | 23 +++++++++---------- autoPyTorch/evaluation/abstract_evaluator.py | 5 +++- autoPyTorch/evaluation/stacking_evaluator.py | 19 ++++++++------- autoPyTorch/evaluation/tae.py | 7 ++++-- autoPyTorch/evaluation/train_evaluator.py | 10 +++++--- autoPyTorch/optimizer/smbo.py | 8 +++++-- 9 files changed, 64 insertions(+), 37 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index b4ba01c99..b8e2af296 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -48,7 +48,7 @@ NoResamplingStrategyTypes, ResamplingStrategies, ) -from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager +from autoPyTorch.ensemble.ensemble_builder_manager import EnsembleBuilderManager from autoPyTorch.ensemble.singlebest_ensemble import SingleBest from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings @@ -649,9 +649,10 @@ def _load_models(self) -> bool: if self.ensemble_: identifiers = self.ensemble_.get_selected_model_identifiers() - self.models_ = self._backend.load_models_by_identifiers(identifiers) + nonnull_identifiers = [i for i in identifiers if i is not None] + self.models_ = self._backend.load_models_by_identifiers(nonnull_identifiers) if isinstance(self.resampling_strategy, CrossValTypes): - self.cv_models_ = self._backend.load_cv_models_by_identifiers(identifiers) + self.cv_models_ = self._backend.load_cv_models_by_identifiers(nonnull_identifiers) if isinstance(self.resampling_strategy, CrossValTypes): if len(self.cv_models_) == 0: @@ -977,7 +978,8 @@ def _search( load_models: bool = 
True, portfolio_selection: Optional[str] = None, dask_client: Optional[dask.distributed.Client] = None, - smbo_class: Optional[SMBO] = None + smbo_class: Optional[SMBO] = None, + use_ensemble_opt_loss: bool = False ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -1234,6 +1236,7 @@ def _search( func_eval_time_limit_secs=func_eval_time_limit_secs) # ============> Starting ensemble + self.use_ensemble_opt_loss = use_ensemble_opt_loss self.precision = precision self.opt_metric = optimize_metric elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) @@ -1309,6 +1312,7 @@ def _search( portfolio_selection=portfolio_selection, pynisher_context=self._multiprocessing_context, smbo_class = smbo_class, + use_ensemble_opt_loss=self.use_ensemble_opt_loss, other_callbacks=[proc_runhistory_updater] if proc_runhistory_updater is not None else None ) try: @@ -1915,6 +1919,7 @@ def _init_ensemble_builder( random_state=self.seed, precision=precision, logger_port=self._logger_port, + use_ensemble_loss=self.use_ensemble_opt_loss ) self._stopwatch.stop_task(ensemble_task_name) @@ -2005,7 +2010,7 @@ def predict( joblib.delayed(_pipeline_predict)( models[identifier], X_test, batch_size, self._logger, STRING_TO_TASK_TYPES[self.task_type] ) - for identifier in self.ensemble_.get_selected_model_identifiers() + for identifier in self.ensemble_.get_selected_model_identifiers() if identifier is not None ) if len(all_predictions) == 0: diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 5641c1005..3e6354c03 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -258,7 +258,8 @@ def search( load_models: bool = True, portfolio_selection: Optional[str] = None, dataset_compression: Union[Mapping[str, Any], bool] = False, - smbo_class: Optional[SMBO] = None + smbo_class: Optional[SMBO] = None, + use_ensemble_opt_loss=False ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -458,7 +459,8 @@ def search( disable_file_output=disable_file_output, load_models=load_models, portfolio_selection=portfolio_selection, - smbo_class=smbo_class + smbo_class=smbo_class, + use_ensemble_opt_loss=use_ensemble_opt_loss ) def predict( diff --git a/autoPyTorch/ensemble/ensemble_builder_manager.py b/autoPyTorch/ensemble/ensemble_builder_manager.py index 7c0786bb9..84ef362ba 100644 --- a/autoPyTorch/ensemble/ensemble_builder_manager.py +++ b/autoPyTorch/ensemble/ensemble_builder_manager.py @@ -47,6 +47,7 @@ def __init__( random_state: int, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, pynisher_context: str = 'fork', + use_ensemble_loss=False ): """ SMAC callback to handle ensemble building Args: @@ -135,6 +136,8 @@ def __init__( # Keep track of when we started to know when we need to finish! 
self.start_time = time.time() + self.use_ensemble_loss = use_ensemble_loss + def __call__( self, smbo: 'SMBO', @@ -226,6 +229,7 @@ def build_ensemble( pynisher_context=self.pynisher_context, logger_port=self.logger_port, unit_test=unit_test, + use_ensemble_opt_loss=self.use_ensemble_loss )) logger.info( @@ -268,6 +272,7 @@ def fit_and_return_ensemble( pynisher_context: str, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, unit_test: bool = False, + use_ensemble_opt_loss=False ) -> Tuple[ List[Dict[str, float]], int, @@ -352,6 +357,7 @@ def fit_and_return_ensemble( random_state=random_state, logger_port=logger_port, unit_test=unit_test, + use_ensemble_opt_loss=use_ensemble_opt_loss ).run( end_at=end_at, iteration=iteration, @@ -359,5 +365,3 @@ def fit_and_return_ensemble( pynisher_context=pynisher_context, ) return result - - diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index 17689657d..01c582410 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -20,7 +20,6 @@ from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss, calculate_score from autoPyTorch.utils.logging_ import get_named_client_logger -from autoPyTorch.metrics import zero_one_loss Y_ENSEMBLE = 0 @@ -29,18 +28,17 @@ MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' -def calculate_nomalised_margin_loss(ensemble_predictions, y_true) -> float: - nonnull_preds = 0 - margin: float = 0 +def calculate_nomalised_margin_loss(ensemble_predictions, y_true, task_type) -> float: + n_ensemble = 0 + loss = 0 for pred in ensemble_predictions: if pred is not None: - nonnull_preds += 1 - margin += (1 - 2*zero_one_loss(y_true, pred)) - - margin /= nonnull_preds - - return pow((1-margin), 2)/4 + n_ensemble += 1 + loss += 1 -2*(y_true != np.argmax(pred, axis=1)).astype(float) + loss /= n_ensemble + margin = np.power(1-loss, 2)/4 + return np.mean(margin) # TODO: make functions to support stacking. class StackingEnsembleBuilder(EnsembleBuilder): @@ -63,6 +61,7 @@ def __init__( random_state: Optional[Union[int, np.random.RandomState]] = None, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, unit_test: bool = False, + use_ensemble_opt_loss=False ): """ Constructor @@ -381,7 +380,7 @@ def compute_ensemble_loss_per_model(self) -> bool: ensemble_identifiers=ensemble_idenitfiers ) - self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] + self.read_losses[y_ens_fn]["ens_loss"] = losses["ensemble_opt_loss"] if self.use_ensemble_opt_loss else losses[self.opt_metric] # self.read_losses[y_ens_fn]["ens_loss"] = losses["ensemble_opt_loss"] # It is not needed to create the object here # To save memory, we just compute the loss. 
@@ -634,7 +633,7 @@ def get_ensemble_loss_with_model(self, prediction=average_predictions, task_type=self.task_type, ) - # loss["ensemble_opt_loss"] = calculate_nomalised_margin_loss(ensemble_predictions, self.y_true_ensemble) + loss["ensemble_opt_loss"] = calculate_nomalised_margin_loss(ensemble_predictions, self.y_true_ensemble) return loss def _get_ensemble_identifiers_filename(self): diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 7202096b6..3fcc64889 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -424,7 +424,8 @@ def __init__(self, backend: Backend, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + use_ensemble_opt_loss=False ) -> None: self.starttime = time.time() @@ -510,6 +511,8 @@ def __init__(self, backend: Backend, port=logger_port, ) + self.use_ensemble_opt_loss = use_ensemble_opt_loss + self._init_fit_dictionary(logger_port=logger_port, pipeline_config=pipeline_config, metrics_dict=metrics_dict) self.Y_optimization: Optional[np.ndarray] = None self.Y_actual_train: Optional[np.ndarray] = None diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/stacking_evaluator.py index e3d77534e..4207e234f 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/stacking_evaluator.py @@ -16,7 +16,7 @@ CLASSIFICATION_TASKS, MULTICLASSMULTIOUTPUT, ) -from autoPyTorch.datasets.ensemble.stacking_ensemble_builder import calculate_nomalised_margin_loss +from autoPyTorch.ensemble.stacking_ensemble_builder import calculate_nomalised_margin_loss from autoPyTorch.evaluation.abstract_evaluator import ( AbstractEvaluator, fit_and_suppress_warnings @@ -115,7 +115,8 @@ def __init__(self, backend: Backend, queue: Queue, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None) -> None: + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + use_ensemble_opt_loss=False) -> None: super().__init__( backend=backend, queue=queue, @@ -133,10 +134,11 @@ def __init__(self, backend: Backend, queue: Queue, logger_port=logger_port, all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, - search_space_updates=search_space_updates + search_space_updates=search_space_updates, + use_ensemble_opt_loss=use_ensemble_opt_loss ) - self.logger.debug("Search space updates :{}".format(self.search_space_updates)) + self.logger.debug("use_ensemble_loss :{}".format(self.use_ensemble_opt_loss)) def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], valid_pred: Optional[np.ndarray], @@ -173,8 +175,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], if loss_ is not None: return self.duration, loss_, self.seed, additional_run_info_ - cost = loss[self.metric.name] - # cost = loss["ensemble_opt_loss"] + cost = loss["ensemble_opt_loss"] if self.use_ensemble_opt_loss else loss[self.metric.name] additional_run_info = ( {} if additional_run_info is None else additional_run_info @@ -306,7 +307,7 @@ def fit_predict_and_loss(self) -> None: train_loss = self._loss(self.y_train[train_split], y_train_pred) loss = 
self._loss(self.y_train[test_split], y_ensemble_opt_pred) - # loss['ensemble_opt_loss'] = calculate_nomalised_margin_loss(y_ensemble_preds, self.y_train[test_split]) + loss['ensemble_opt_loss'] = calculate_nomalised_margin_loss(y_ensemble_preds, self.y_train[test_split], self.task_type) additional_run_info = pipeline.get_additional_run_info() if hasattr( pipeline, 'get_additional_run_info') else {} @@ -415,6 +416,7 @@ def eval_function( logger_port: Optional[int] = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + use_ensemble_opt_loss=False, instance: str = None, ) -> None: """ @@ -496,6 +498,7 @@ def eval_function( logger_port=logger_port, all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, - search_space_updates=search_space_updates + search_space_updates=search_space_updates, + use_ensemble_opt_loss=use_ensemble_opt_loss ) evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index c756d5e8e..4ac84c8ef 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -131,7 +131,8 @@ def __init__( logger_port: int = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - ensemble_method = None + ensemble_method = None, + use_ensemble_opt_loss=False ): self.backend = backend @@ -208,6 +209,7 @@ def __init__( self.memory_limit = memory_limit self.search_space_updates = search_space_updates + self.use_ensemble_opt_loss = use_ensemble_opt_loss def _check_and_get_default_budget(self) -> float: budget_type_choices = ('epochs', 'runtime') @@ -346,7 +348,8 @@ def run( pipeline_config=self.pipeline_config, logger_port=self.logger_port, all_supported_metrics=self.all_supported_metrics, - search_space_updates=self.search_space_updates + search_space_updates=self.search_space_updates, + use_ensemble_opt_loss=self.use_ensemble_opt_loss ) info: Optional[List[RunValue]] diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 5c937d614..e5884a9f7 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -131,7 +131,8 @@ def __init__(self, backend: Backend, queue: Queue, logger_port: Optional[int] = None, keep_models: Optional[bool] = None, all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None) -> None: + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + use_ensemble_opt_loss=False) -> None: super().__init__( backend=backend, queue=queue, @@ -149,7 +150,8 @@ def __init__(self, backend: Backend, queue: Queue, logger_port=logger_port, all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, - search_space_updates=search_space_updates + search_space_updates=search_space_updates, + use_ensemble_opt_loss=use_ensemble_opt_loss ) if not isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)): @@ -428,6 +430,7 @@ def eval_train_function( logger_port: Optional[int] = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + use_ensemble_opt_loss=False, instance: str = None, ) -> None: """ @@ -509,6 +512,7 @@ def eval_train_function( logger_port=logger_port, all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, - search_space_updates=search_space_updates + search_space_updates=search_space_updates, + 
use_ensemble_opt_loss=use_ensemble_opt_loss ) evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 1238e608d..a4a8ce20e 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -121,7 +121,8 @@ def __init__(self, max_budget: int = 50, ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, other_callbacks: Optional[List] = None, - smbo_class: Optional[SMBO] = None + smbo_class: Optional[SMBO] = None, + use_ensemble_opt_loss: bool = False ): """ Interface to SMAC. This method calls the SMAC optimize method, and allows @@ -253,6 +254,8 @@ def __init__(self, port=self.logger_port) self.logger.info("initialised {}".format(self.__class__.__name__)) + self.use_ensemble_opt_loss = use_ensemble_opt_loss + self.initial_configurations: Optional[List[Configuration]] = None if portfolio_selection is not None: initial_configurations = read_return_initial_configurations(config_space=config_space, @@ -303,7 +306,8 @@ def run_smbo(self, func: Optional[Callable] = None pipeline_config=self.pipeline_config, search_space_updates=self.search_space_updates, pynisher_context=self.pynisher_context, - ensemble_method=self.ensemble_method + ensemble_method=self.ensemble_method, + use_ensemble_opt_loss=self.use_ensemble_opt_loss ) ta = ExecuteTaFuncWithQueue self.logger.info("Finish creating Target Algorithm (TA) function") From 7e7001b36acbc195c598fb7023ec28a14b1a000c Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 18:15:19 +0200 Subject: [PATCH 14/16] final working version of ensemble bayesian learning --- autoPyTorch/optimizer/smbo.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index a4a8ce20e..47fb4e619 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -258,11 +258,12 @@ def __init__(self, self.initial_configurations: Optional[List[Configuration]] = None if portfolio_selection is not None: - initial_configurations = read_return_initial_configurations(config_space=config_space, - portfolio_selection=portfolio_selection) - # incase we dont have any valid configuration from the portfolio - self.initial_configurations = initial_configurations \ - if len(initial_configurations) > 0 else None + self.initial_configurations = read_return_initial_configurations(config_space=config_space, + portfolio_selection=portfolio_selection) + if len(self.initial_configurations) == 0: + self.initial_configurations = None + self.logger.warning("None of the portfolio configurations are compatible" + " with the current search space. Skipping initial configuration...") def run_smbo(self, func: Optional[Callable] = None ) -> Tuple[RunHistory, List[TrajEntry], str]: From 32f0d2f82177cba65693764cdcfcd8503a3a33e3 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 26 Apr 2022 19:11:19 +0200 Subject: [PATCH 15/16] minor fix for init use ensemble loss --- autoPyTorch/ensemble/stacking_ensemble_builder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/stacking_ensemble_builder.py index 01c582410..5097835f2 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/stacking_ensemble_builder.py @@ -131,6 +131,7 @@ def __init__( # or passing them via the ensemble builder manager which has persistency with the futures stored. 
        self.ensemble_identifiers: Optional[List[Optional[str]]] = None
         self.read_losses = {}
+        self.use_ensemble_opt_loss = use_ensemble_opt_loss
 
     # This is the main wrapper to the EnsembleSelection class which fits the ensemble
     def main(
@@ -633,7 +634,7 @@ def get_ensemble_loss_with_model(self,
             prediction=average_predictions,
             task_type=self.task_type,
         )
-        loss["ensemble_opt_loss"] = calculate_nomalised_margin_loss(ensemble_predictions, self.y_true_ensemble)
+        loss["ensemble_opt_loss"] = calculate_nomalised_margin_loss(ensemble_predictions, self.y_true_ensemble, self.task_type)
         return loss
 
     def _get_ensemble_identifiers_filename(self):

From 5c09afcd38cc693247994812f46e7d006fc7e074 Mon Sep 17 00:00:00 2001
From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com>
Date: Wed, 22 Jun 2022 12:08:01 +0200
Subject: [PATCH 16/16] Stacking ensemble selection clean (#2)

* add repeated kfold
* working repeated k fold
* working stacking evaluator without changing dataset and no final predict
* replace datamanager
* fix prediction with stack ensembles
* adaptive repeats
* working version of stacking with changing dataset preserving categorical info
* working version of ensemble selection per layer, TODO: send predictions according to the weights associated with the model
* finish previous todo: send predictions according to the weights associated with the model
* working version of base repeat stacked ensembles, todo: check if other methods still work, add autogluon stacking
* working all stacking versions
* rename optimisation stacking ensemble
* Add autogluon stacking (#1)
* add working traditional models according to autogluon
* working pytorch embedding with skew and embed column splitting
* work in progress: autogluon ensembling
* working autogluon ensemble
* important fix for more than 2 stacking layers
* fix for running more than 2 stacking layers
* working autogluon with default nn config from autogluon
* working xgboost model
* add configurationspace to traditional classification models
* working autogluon stacking and stacking optimisation, todo: search for autogluon models and post hoc ensemble selection for ensemble optimisation
* added post fit ensemble optimization, working per layer selection, repeat models, stacking optimisation
* update config space for search, fix stratified resampling, fix printing model with weights for soe
* fix running traditional pipeline for all the ensembles, fix get config from run history
* fix cut off num run for all ensembles
* __init__ file for column splitting
* all requirements
* add __init__.py for trad ml
* pass smbo class to custom callback
* early stop also ensemble opt
* remove -1 from autogluon stacking
* reduce number of models stored after stacking
* fix issue with null identifiers in selected ensemble identifiers
* remove pointless line for debug
* set multiprocessing context to forkserver for n workers 1
* fix error when all repeats do not finish
* examples changed
---
 autoPyTorch/api/base_task.py                  | 1114 ++++++++++++-----
 autoPyTorch/api/tabular_classification.py     |   64 +-
 autoPyTorch/api/tabular_regression.py         |   10 +-
 autoPyTorch/api/utils.py                      |  139 ++
 autoPyTorch/data/base_feature_validator.py    |    2 +-
 autoPyTorch/data/tabular_feature_validator.py |   12 +-
 autoPyTorch/data/tabular_validator.py         |    7 +-
 autoPyTorch/datasets/base_dataset.py          |   91 +-
 autoPyTorch/datasets/resampling_strategy.py   |   91 +-
 autoPyTorch/datasets/tabular_dataset.py       |    7 +-
 autoPyTorch/datasets/utils.py                 |   48 +
 .../ensemble/autogluon_stacking_ensemble.py   |  158 +++
autoPyTorch/ensemble/ensemble_builder.py | 14 +- .../ensemble/ensemble_builder_manager.py | 66 +- ...nsemble_optimisation_stacking_ensemble.py} | 69 +- ...optimisation_stacking_ensemble_builder.py} | 173 ++- ...e_selection_per_layer_stacking_ensemble.py | 145 +++ ...ion_per_layer_stacking_ensemble_builder.py | 620 +++++++++ .../repeat_models_stacking_ensemble.py | 179 +++ autoPyTorch/ensemble/utils.py | 26 +- autoPyTorch/evaluation/abstract_evaluator.py | 65 +- .../ensemble_optimisation_evaluator.py | 648 ++++++++++ ...ator.py => repeated_crossval_evaluator.py} | 282 +++-- autoPyTorch/evaluation/tae.py | 39 +- autoPyTorch/evaluation/train_evaluator.py | 24 +- autoPyTorch/evaluation/utils.py | 9 + autoPyTorch/optimizer/run_history_callback.py | 1 + autoPyTorch/optimizer/smbo.py | 148 ++- autoPyTorch/optimizer/utils.py | 11 +- autoPyTorch/pipeline/base_pipeline.py | 27 - .../TabularColumnTransformer.py | 20 +- .../base_tabular_preprocessing.py | 5 +- .../column_splitting/ColumnSplitter.py | 100 ++ .../column_splitting/__init__.py | 0 .../encoding/NoEncoder.py | 12 - .../encoding/OneHotEncoder.py | 11 +- .../encoding/base_encoder.py | 10 +- .../scaling/MinMaxScaler.py | 3 +- .../tabular_preprocessing/scaling/NoScaler.py | 14 - .../scaling/Normalizer.py | 3 +- .../scaling/RobustScaler.py | 3 +- .../scaling/StandardScaler.py | 3 +- .../scaling/base_scaler.py | 9 +- .../skew_transformer/NoSkewTransformer.py | 42 + .../PowerTransformer.py | 9 +- .../QuantileTransformer.py | 9 +- .../skew_transformer/__init__.py | 143 +++ .../skew_transformer/base_skew_transformer.py | 33 + .../tabular_preprocessing/utils.py | 9 +- .../early_preprocessor/EarlyPreprocessing.py | 5 +- .../components/setup/network/base_network.py | 1 - .../setup/network_backbone/MLPBackbone.py | 10 +- .../network_backbone/base_network_backbone.py | 6 +- .../setup/network_backbone/utils.py | 5 +- .../LearnedEntityEmbedding.py | 109 +- .../setup/network_embedding/NoEmbedding.py | 2 +- .../base_network_embedding.py | 42 +- .../setup/traditional_ml/base_model.py | 32 +- .../estimator_configs/catboost.json | 4 - .../estimator_configs/extra_trees.json | 3 - .../traditional_ml/estimator_configs/knn.json | 3 - .../traditional_ml/estimator_configs/lgb.json | 9 - .../estimator_configs/random_forest.json | 3 - .../estimator_configs/rotation_forest.json | 2 - .../traditional_ml/estimator_configs/svm.json | 4 - .../tabular_traditional_model.py | 65 +- .../traditional_learner/__init__.py | 27 +- .../base_traditional_learner.py | 27 +- .../traditional_learner/catboost/__init__.py | 0 .../traditional_learner/catboost/catboost.py | 142 +++ .../traditional_learner/catboost/utils.py | 138 ++ .../extratrees/__init__.py | 0 .../extratrees/extratrees.py | 99 ++ .../traditional_learner/extratrees/utils.py | 7 + .../traditional_learner/knn/__init__.py | 0 .../traditional_learner/knn/knn.py | 108 ++ .../traditional_learner/knn/utils.py | 8 + .../traditional_learner/learners.py | 361 ------ .../traditional_learner/lgbm/__init__.py | 0 .../traditional_learner/lgbm/lgbm.py | 153 +++ .../traditional_learner/lgbm/utils.py | 298 +++++ .../random_forest/__init__.py | 0 .../random_forest/random_forest.py | 103 ++ .../random_forest/utils.py | 9 + .../traditional_learner/utils.py | 15 - .../traditional_learner/xgboost/__init__.py | 0 .../xgboost/early_stopping_custom.py | 90 ++ .../traditional_learner/xgboost/utils.py | 85 ++ .../traditional_learner/xgboost/xgboost.py | 198 +++ .../training/data_loader/base_data_loader.py | 10 +- 
.../components/training/trainer/__init__.py | 16 +- .../pipeline/tabular_classification.py | 8 +- .../traditional_tabular_classification.py | 22 +- autoPyTorch/utils/common.py | 44 +- autoPyTorch/utils/data_classes.py | 27 + autoPyTorch/utils/early_stopping.py | 47 + autoPyTorch/utils/parallel_model_runner.py | 167 +++ .../20_basics/example_autogluon_ensemble.py | 105 ++ .../20_basics/example_stacking_ensemble.py | 119 +- ...xample_stacking_ensemble_selection_base.py | 109 ++ ...e_stacking_ensemble_selection_per_layer.py | 107 ++ .../example_tabular_classification.py | 51 +- requirements.txt | 10 +- 103 files changed, 6480 insertions(+), 1262 deletions(-) create mode 100644 autoPyTorch/api/utils.py create mode 100644 autoPyTorch/datasets/utils.py create mode 100644 autoPyTorch/ensemble/autogluon_stacking_ensemble.py rename autoPyTorch/ensemble/{stacking_ensemble.py => ensemble_optimisation_stacking_ensemble.py} (80%) rename autoPyTorch/ensemble/{stacking_ensemble_builder.py => ensemble_optimisation_stacking_ensemble_builder.py} (77%) create mode 100644 autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble.py create mode 100644 autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble_builder.py create mode 100644 autoPyTorch/ensemble/repeat_models_stacking_ensemble.py create mode 100644 autoPyTorch/evaluation/ensemble_optimisation_evaluator.py rename autoPyTorch/evaluation/{stacking_evaluator.py => repeated_crossval_evaluator.py} (62%) create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/__init__.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/NoSkewTransformer.py rename autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/{scaling => skew_transformer}/PowerTransformer.py (76%) rename autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/{scaling => skew_transformer}/QuantileTransformer.py (90%) create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/__init__.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/base_skew_transformer.py delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/catboost.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/utils.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/__init__.py create mode 100644 
autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/extratrees.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/utils.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/knn.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/utils.py delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/learners.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/lgbm.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/utils.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/random_forest.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/utils.py delete mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/utils.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/early_stopping_custom.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/utils.py create mode 100644 autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/xgboost.py create mode 100644 autoPyTorch/utils/data_classes.py create mode 100644 autoPyTorch/utils/early_stopping.py create mode 100644 autoPyTorch/utils/parallel_model_runner.py create mode 100644 examples/20_basics/example_autogluon_ensemble.py create mode 100644 examples/20_basics/example_stacking_ensemble_selection_base.py create mode 100644 examples/20_basics/example_stacking_ensemble_selection_per_layer.py diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index b8e2af296..fa4998917 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -8,6 +8,7 @@ import sys import tempfile import time +from turtle import pos import typing import unittest.mock import warnings @@ -33,6 +34,7 @@ from smac.tae import StatusType from autoPyTorch import metrics +from autoPyTorch.api.utils import get_autogluon_default_nn_config, get_config_from_run_history from autoPyTorch.automl_common.common.utils.backend import Backend, create from autoPyTorch.constants import ( REGRESSION_TASKS, @@ -40,6 +42,7 @@ STRING_TO_TASK_TYPES, ) from autoPyTorch.data.base_validator import BaseInputValidator +from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.utils import DatasetCompressionSpec from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( @@ -47,9 +50,15 @@ HoldoutValTypes, NoResamplingStrategyTypes, ResamplingStrategies, + RepeatedCrossValTypes ) +from autoPyTorch.datasets.utils import get_appended_dataset +from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection +from 
autoPyTorch.ensemble.repeat_models_stacking_ensemble import RepeatModelsStackingEnsemble from autoPyTorch.ensemble.ensemble_builder_manager import EnsembleBuilderManager from autoPyTorch.ensemble.singlebest_ensemble import SingleBest +from autoPyTorch.ensemble.autogluon_stacking_ensemble import AutogluonStackingEnsemble +from autoPyTorch.ensemble.ensemble_selection_per_layer_stacking_ensemble import EnsembleSelectionPerLayerStackingEnsemble from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash @@ -60,7 +69,8 @@ from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import get_available_traditional_learners from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score, get_metrics -from autoPyTorch.utils.common import FitRequirement, dict_repr, replace_string_bool_to_bool +from autoPyTorch.utils.common import FitRequirement, ENSEMBLE_ITERATION_MULTIPLIER, dict_repr, replace_string_bool_to_bool, validate_config +from autoPyTorch.utils.parallel_model_runner import run_models_on_dataset from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.logging_ import ( PicklableClientLogger, @@ -172,9 +182,11 @@ def __init__( n_jobs: int = 1, n_threads: int = 1, logging_config: Optional[Dict] = None, - ensemble_size: int = 50, + ensemble_size: int = 5, ensemble_nbest: int = 50, - ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, + ensemble_method: EnsembleSelectionTypes = EnsembleSelectionTypes.ensemble_selection, + use_ensemble_opt_loss: bool = False, + num_stacking_layers: int = 1, max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, @@ -185,6 +197,7 @@ def __init__( backend: Optional[Backend] = None, resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, + feat_type: Optional[List[str]] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, task_type: Optional[str] = None ) -> None: @@ -198,6 +211,9 @@ def __init__( self.ensemble_size = ensemble_size self.ensemble_nbest = ensemble_nbest self.ensemble_method = ensemble_method + self.num_stacking_layers = num_stacking_layers + self.use_ensemble_opt_loss = use_ensemble_opt_loss + self.max_models_on_disc = max_models_on_disc self.logging_config: Optional[Dict] = logging_config self.include_components: Optional[Dict] = include_components @@ -230,7 +246,7 @@ def __init__( self.precision: Optional[int] = None self.opt_metric: Optional[str] = None self.dataset: Optional[BaseDataset] = None - + self.ensemble_ = None self._results_manager = ResultsManager() # By default try to use the TCP logging port or get a new port @@ -239,7 +255,7 @@ def __init__( # Store the resampling strategy from the dataset, to load models as needed self.resampling_strategy = resampling_strategy self.resampling_strategy_args = resampling_strategy_args - + self.feat_type = feat_type self.stop_logging_server: Optional[multiprocessing.synchronize.Event] = None # Single core, local runs should use fork @@ -249,8 +265,8 @@ def __init__( # possibility of a deadlock self._dask_client: Optional[dask.distributed.Client] = None 
self._multiprocessing_context = 'forkserver' - if self.n_jobs == 1: - self._multiprocessing_context = 'fork' + # if self.n_jobs == 1: + # self._multiprocessing_context = 'fork' self.input_validator: Optional[BaseInputValidator] = None @@ -305,6 +321,7 @@ def _get_dataset_input_validator( y_train: Union[List, pd.DataFrame, np.ndarray], X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + feat_type: Optional[List] = None, resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, @@ -351,6 +368,7 @@ def get_dataset( y_train: Union[List, pd.DataFrame, np.ndarray], X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + feat_type: Optional[List] = None, resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, @@ -422,7 +440,8 @@ def get_dataset( resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, dataset_name=dataset_name, - dataset_compression=dataset_compression) + dataset_compression=dataset_compression, + feat_type=feat_type) return dataset @@ -480,18 +499,31 @@ def get_search_space(self, dataset: BaseDataset = None) -> ConfigurationSpace: if self.search_space is not None: return self.search_space elif dataset is not None: - dataset_requirements = get_dataset_requirements( - info=dataset.get_required_dataset_info(), - include=self.include_components, - exclude=self.exclude_components, + return self._get_search_space( + dataset, + include_components=self.include_components, + exclude_components=self.exclude_components, search_space_updates=self.search_space_updates) - return get_configuration_space(info=dataset.get_dataset_properties(dataset_requirements), - include=self.include_components, - exclude=self.exclude_components, - search_space_updates=self.search_space_updates) raise ValueError("No search space initialised and no dataset passed. 
" "Can't create default search space without the dataset") + @staticmethod + def _get_search_space( + dataset: BaseDataset, + include_components, + exclude_components, + search_space_updates, + ) -> ConfigurationSpace: + dataset_requirements = get_dataset_requirements( + info=dataset.get_required_dataset_info(), + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates) + return get_configuration_space(info=dataset.get_dataset_properties(dataset_requirements), + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates) + def _get_logger(self, name: str) -> PicklableClientLogger: """ Instantiates the logger used throughout the experiment @@ -649,12 +681,28 @@ def _load_models(self) -> bool: if self.ensemble_: identifiers = self.ensemble_.get_selected_model_identifiers() - nonnull_identifiers = [i for i in identifiers if i is not None] - self.models_ = self._backend.load_models_by_identifiers(nonnull_identifiers) - if isinstance(self.resampling_strategy, CrossValTypes): - self.cv_models_ = self._backend.load_cv_models_by_identifiers(nonnull_identifiers) + # nonnull_identifiers = [i for i in identifiers if i is not None] + # self.models_ = self._backend.load_models_by_identifiers(nonnull_identifiers) + # if isinstance(self.resampling_strategy, CrossValTypes): + # self.cv_models_ = self._backend.load_cv_models_by_identifiers(nonnull_identifiers) + + # self._logger.debug(f"stacked ensemble identifiers are :{identifiers}") + if self.ensemble_method.is_stacking_ensemble(): + models = [] + cv_models = [] + for identifier in identifiers: + nonnull_identifiers = [i for i in identifier if i is not None] + models.append(self._backend.load_models_by_identifiers(nonnull_identifiers)) + cv_models.append(self._backend.load_cv_models_by_identifiers(nonnull_identifiers)) + # self._logger.debug(f"stacked ensemble models are :{models}") + self.models_ = models + self.cv_models_ = cv_models - if isinstance(self.resampling_strategy, CrossValTypes): + else: + self.models_ = self._backend.load_models_by_identifiers(identifiers) + if isinstance(self.resampling_strategy, (CrossValTypes, RepeatedCrossValTypes)): + self.cv_models_ = self._backend.load_cv_models_by_identifiers(identifiers) + if isinstance(self.resampling_strategy, (CrossValTypes, RepeatedCrossValTypes)): if len(self.cv_models_) == 0: raise ValueError('No models fitted!') @@ -765,7 +813,8 @@ def _do_dummy_prediction(self) -> None: memory_limit=memory_limit, disable_file_output=self._disable_file_output, all_supported_metrics=self._all_supported_metrics, - ensemble_method=self.ensemble_method + ensemble_method=self.ensemble_method, + pipeline_config=self.pipeline_options ) status, _, _, additional_info = ta.run(num_run, cutoff=self._time_for_task) @@ -814,113 +863,33 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: assert self._dask_client is not None self._logger.info("Starting to create traditional classifier predictions.") - starttime = time.time() - # Initialise run history for the traditional classifiers - run_history = RunHistory() - memory_limit = self._memory_limit - if memory_limit is not None: - memory_limit = int(math.ceil(memory_limit)) - available_classifiers = get_available_traditional_learners() - dask_futures = [] - - total_number_classifiers = len(available_classifiers) - for n_r, classifier in enumerate(available_classifiers): - - # Only launch a task if there is time - start_time = time.time() - if time_left >= 
func_eval_time_limit_secs: - self._logger.info(f"{n_r}: Started fitting {classifier} with cutoff={func_eval_time_limit_secs}") - scenario_mock = unittest.mock.Mock() - scenario_mock.wallclock_limit = time_left - # This stats object is a hack - maybe the SMAC stats object should - # already be generated here! - stats = Stats(scenario_mock) - stats.start_timing() - ta = ExecuteTaFuncWithQueue( - pynisher_context=self._multiprocessing_context, - backend=self._backend, - seed=self.seed, - multi_objectives=["cost"], - metric=self._metric, - logger_port=self._logger_port, - cost_for_crash=get_cost_of_crash(self._metric), - abort_on_first_run_crash=False, - initial_num_run=self._backend.get_next_num_run(), - stats=stats, - memory_limit=memory_limit, - disable_file_output=self._disable_file_output, - all_supported_metrics=self._all_supported_metrics - ) - dask_futures.append([ - classifier, - self._dask_client.submit( - ta.run, config=classifier, - cutoff=func_eval_time_limit_secs, - ) - ]) - - # When managing time, we need to take into account the allocated time resources, - # which are dependent on the number of cores. 'dask_futures' is a proxy to the number - # of workers /n_jobs that we have, in that if there are 4 cores allocated, we can run at most - # 4 task in parallel. Every 'cutoff' seconds, we generate up to 4 tasks. - # If we only have 4 workers and there are 4 futures in dask_futures, it means that every - # worker has a task. We would not like to launch another job until a worker is available. To this - # end, the following if-statement queries the number of active jobs, and forces to wait for a job - # completion via future.result(), so that a new worker is available for the next iteration. - if len(dask_futures) >= self.n_jobs: - - # How many workers to wait before starting fitting the next iteration - workers_to_wait = 1 - if n_r >= total_number_classifiers - 1 or time_left <= func_eval_time_limit_secs: - # If on the last iteration, flush out all tasks - workers_to_wait = len(dask_futures) - - while workers_to_wait >= 1: - workers_to_wait -= 1 - # We launch dask jobs only when there are resources available. 
- # This allow us to control time allocation properly, and early terminate - # the traditional machine learning pipeline - cls, future = dask_futures.pop(0) - status, cost, runtime, additional_info = future.result() - if status == StatusType.SUCCESS: - self._logger.info( - "Fitting {} took {} [sec] and got performance: {}.\n" - "additional info:\n{}".format(cls, runtime, cost, dict_repr(additional_info)) - ) - configuration = additional_info['pipeline_configuration'] - origin = additional_info['configuration_origin'] - additional_info.pop('pipeline_configuration') - run_history.add(config=configuration, cost=cost, - time=runtime, status=status, seed=self.seed, - starttime=starttime, endtime=starttime + runtime, - origin=origin, additional_info=additional_info) - else: - if additional_info.get('exitcode') == -6: - self._logger.error( - "Traditional prediction for {} failed with run state {},\n" - "because the provided memory limits were too tight.\n" - "Please increase the 'ml_memory_limit' and try again.\n" - "If you still get the problem, please open an issue\n" - "and paste the additional info.\n" - "Additional info:\n{}".format(cls, str(status), dict_repr(additional_info)) - ) - else: - self._logger.error( - "Traditional prediction for {} failed with run state {}.\nAdditional info:\n{}".format( - cls, str(status), dict_repr(additional_info) - ) - ) - - # In the case of a serial execution, calling submit halts the run for a resource - # dynamically adjust time in this case - time_left -= int(time.time() - start_time) - - # Exit if no more time is available for a new classifier - if time_left < func_eval_time_limit_secs: - self._logger.warning("Not enough time to fit all traditional machine learning models." - "Please consider increasing the run time to further improve performance.") - break + available_classifiers = get_available_traditional_learners(dataset_properties=self._get_dataset_properties(self.dataset)) + model_configs = [(key, self.pipeline_options[self.pipeline_options['budget_type']]) for key in available_classifiers.keys()] + + run_history, _ = run_models_on_dataset( + time_left=time_left, + func_eval_time_limit_secs=func_eval_time_limit_secs, + model_configs=model_configs, + logger=self._logger, + logger_port=self._logger_port, + metric=self._metric, + dask_client=self._dask_client, + backend=self._backend, + memory_limit=self._memory_limit, + disable_file_output=self._disable_file_output, + all_supported_metrics=self._all_supported_metrics, + ensemble_method=self.ensemble_method, + include=self.include_components, + exclude=self.exclude_components, + search_space_updates=self.search_space_updates, + pipeline_options=self.pipeline_options, + seed=self.seed, + multiprocessing_context=self._multiprocessing_context, + n_jobs=self.n_jobs, + current_search_space=self.search_space, + smac_initial_run=self._backend.get_next_num_run() + ) self._logger.debug("Run history traditional: {}".format(run_history)) # add run history of traditional to api run history @@ -958,6 +927,447 @@ def run_traditional_ml( ) self._stopwatch.stop_task(traditional_task_name) + def _fit_models_on_dataset( + self, + model_configs, + func_eval_time_limit_secs, + stacking_layer, + time_left, + current_search_space, + smac_initial_run, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> List[Tuple]:\ + + search_space_updates = search_space_updates if search_space_updates is not None else self.search_space_updates + + run_history, model_identifiers = run_models_on_dataset( + 
time_left=time_left, + func_eval_time_limit_secs=func_eval_time_limit_secs, + model_configs=model_configs, + logger=self._logger, + logger_port=self._logger_port, + metric=self._metric, + dask_client=self._dask_client, + backend=self._backend, + memory_limit=self._memory_limit, + disable_file_output=self._disable_file_output, + all_supported_metrics=self._all_supported_metrics, + ensemble_method=self.ensemble_method, + include=self.include_components, + exclude=self.exclude_components, + search_space_updates=search_space_updates, + pipeline_options=self.pipeline_options, + seed=self.seed, + multiprocessing_context=self._multiprocessing_context, + n_jobs=self.n_jobs, + current_search_space=current_search_space, + smac_initial_run=smac_initial_run + ) + + self._logger.debug("Run history for layer: {}: {}".format(stacking_layer, run_history)) + # add run history of traditional to api run history + self.run_history.update(run_history, DataOrigin.EXTERNAL_SAME_INSTANCES) + run_history.save_json(os.path.join(self._backend.internals_directory, f'run_history_{stacking_layer}.json'), + save_external=True) + return model_identifiers + + def _reset_datamanager_in_backend(self, datamanager)-> None: + self._backend.save_datamanager(datamanager) + + def _run_autogluon_stacking( + self, + optimize_metric: str, + dataset: BaseDataset, + max_budget: int = 50, + budget_type: str = 'epochs', + total_walltime_limit: int = 100, + func_eval_time_limit_secs: Optional[int] = None, + memory_limit: Optional[int] = 4096, + all_supported_metrics: bool = True, + precision: int = 32, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + dask_client: Optional[dask.distributed.Client] = None, + ): + """ + This function can be used to create a stacking ensemble + Args: + current_task_name (str): name of the current task, + runtime_limit (int): time limit for fitting traditional models, + func_eval_time_limit_secs (int): Time limit + for a single call to the machine learning model. + Model fitting will be terminated if the machine + learning algorithm runs over the time limit. 
+ """ + experiment_task_name: str = 'runStacking' + self._init_required_args( + experiment_task_name=experiment_task_name, + optimize_metric=optimize_metric, + dataset=dataset, + budget_type=budget_type, + max_budget=max_budget, + total_walltime_limit=total_walltime_limit, + memory_limit=memory_limit, + all_supported_metrics=all_supported_metrics, + precision=precision, + disable_file_output=disable_file_output, + dask_client=dask_client + ) + self.pipeline_options['func_eval_time_limit_secs'] = func_eval_time_limit_secs + self.precision = precision + available_classifiers = get_available_traditional_learners(dataset_properties=self._get_dataset_properties(self.dataset)) + model_configs = [(key, self.pipeline_options[self.pipeline_options['budget_type']]) for key in available_classifiers.keys()] + + if self.feat_type is None: + raise ValueError("Cant run autogluon stacking without information about dataset features passed with `feat_type`") + autogluon_nn_search_space_updates = get_autogluon_default_nn_config(feat_type=self.feat_type) + autogluon_nn_search_space = self._get_search_space( + self.dataset, + include_components=self.include_components, + exclude_components=self.exclude_components, + search_space_updates=autogluon_nn_search_space_updates) + + default_nn_config = autogluon_nn_search_space.get_default_configuration() + model_configs.append((default_nn_config, self.pipeline_options[self.pipeline_options['budget_type']])) + self._logger.info("Starting Autogluon Stacking.") + + model_identifiers = [] + stacked_weights = [] + last_successful_smac_initial_num_run = None + for stacking_layer in range(self.num_stacking_layers): + smac_initial_run=self._backend.get_next_num_run() + updated_model_configs, current_search_space = self._update_configs_for_current_config_space( + model_configs, + dataset, + autogluon_nn_search_space_updates, + assert_skew_transformer_quantile=True) + layer_model_identifiers = self._fit_models_on_dataset( + updated_model_configs, + func_eval_time_limit_secs, + stacking_layer, + time_left=(0.9*total_walltime_limit)/(self.num_stacking_layers), + current_search_space=current_search_space, + smac_initial_run=smac_initial_run, + search_space_updates=autogluon_nn_search_space_updates) + nonnull_identifiers = [identifier for identifier in layer_model_identifiers if identifier is not None] + if len(nonnull_identifiers) > 0: + model_identifiers.append( + nonnull_identifiers + ) + last_successful_smac_initial_num_run = smac_initial_run + ensemble_size = len(nonnull_identifiers) + weights = [1/ensemble_size] * ensemble_size + stacked_weights.append(weights) + _, previous_layer_predictions_train, previous_layer_predictions_test = self._get_previous_predictions(smac_initial_run, model_identifiers[-1], weights, ensemble_size) + dataset = get_appended_dataset( + original_dataset=self.dataset, + previous_layer_predictions_train=previous_layer_predictions_train, + previous_layer_predictions_test=previous_layer_predictions_test, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + ) + self._reset_datamanager_in_backend(datamanager=dataset) + + ensemble = AutogluonStackingEnsemble() + iteration = 0 + time_left_for_ensemble = total_walltime_limit-self._stopwatch.wall_elapsed(experiment_task_name) + final_model_identifiers, final_weights = self._posthoc_fit_ensemble( + optimize_metric, + time_left_for_ensemble, + last_successful_smac_initial_num_run, + ensemble_size, + iteration) + model_identifiers[-1] = 
final_model_identifiers + stacked_weights[-1] = final_weights + ensemble = ensemble.fit(model_identifiers, stacked_weights) + self._backend.save_ensemble(ensemble, iteration+1, self.seed) + self._load_models() + + def _posthoc_fit_ensemble( + self, + optimize_metric, + time_left_for_ensemble, + last_successful_smac_initial_num_run, + ensemble_size, + iteration, + enable_traditional_pipeline=False, + cleanup=True, + func_eval_time_limit_secs: int = 50, + ): + self.fit_ensemble( + optimize_metric=optimize_metric, + precision=self.precision, + ensemble_size=ensemble_size, + ensemble_nbest=self.ensemble_nbest, + initial_num_run=last_successful_smac_initial_num_run, + time_for_task=time_left_for_ensemble, + enable_traditional_pipeline=enable_traditional_pipeline, + func_eval_time_limit_secs=func_eval_time_limit_secs, + iteration=iteration, + cleanup=cleanup, + load_models=False + ) + final_ensemble: EnsembleSelection = self._backend.load_ensemble(self.seed) + final_model_identifiers = final_ensemble.get_selected_model_identifiers() + final_model_identifiers_dict = {identifier: identifier for identifier in final_model_identifiers} + models_with_weights = final_ensemble.get_models_with_weights(final_model_identifiers_dict) + final_model_identifiers = [identifier[1] for identifier in models_with_weights] + final_weights = [identifier[0] for identifier in models_with_weights] + return final_model_identifiers,final_weights + + def _run_search_stacking( + self, + optimize_metric: str, + min_budget, + max_budget, + precision, + portfolio_selection, + experiment_task_name, + tae_func = None, + budget_type: str = 'epochs', + total_walltime_limit: int = 400, + func_eval_time_limit_secs: Optional[int] = None, + smac_scenario_args: Optional[Dict[str, Any]] = None, + get_smac_object_callback: Optional[Callable] = None, + ): + stacking_task_name = "runStacking" + self._stopwatch.start_task(stacking_task_name) + self.precision = precision + self.opt_metric = optimize_metric + time_left_for_search_base_models = math.floor(0.5*total_walltime_limit) + proc_ensemble = None + if time_left_for_search_base_models <= 0: + # Fit only raises error when ensemble_size is not zero but + # time_left_for_search_base_models is zero. + if self.ensemble_size > 0: + raise ValueError("Not starting ensemble builder because there " + "is no time left. 
Try increasing the value " + "of time_left_for_this_task.") + elif self.ensemble_size <= 0: + self._logger.info("Not starting ensemble builder as ensemble size is 0") + else: + self._logger.info("Starting ensemble") + proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_search_base_models, + ensemble_size=self.ensemble_size, + ensemble_nbest=self.ensemble_nbest, + precision=precision, + optimize_metric=self.opt_metric, + ensemble_method=self.ensemble_method, + num_stacking_layers=1 + ) + + smac_initial_run = self._run_smbo( + min_budget=min_budget, + max_budget=max_budget, + total_walltime_limit=time_left_for_search_base_models, + func_eval_time_limit_secs=func_eval_time_limit_secs, + smac_scenario_args=smac_scenario_args, + get_smac_object_callback=get_smac_object_callback, + tae_func=tae_func, + portfolio_selection=portfolio_selection, + experiment_task_name=experiment_task_name, + proc_ensemble=proc_ensemble, + num_stacking_layers=1 + ) + if proc_ensemble is not None: + self._collect_results_ensemble(proc_ensemble) + base_ensemble = self._backend.load_ensemble(self.seed) + model_identifiers = [base_ensemble.get_selected_model_identifiers()] + ensemble = RepeatModelsStackingEnsemble(base_ensemble=base_ensemble) + + weights = [weight for weight in base_ensemble.weights_ if weight > 0] + ensemble_size = self.ensemble_size + model_configs, previous_layer_predictions_train, previous_layer_predictions_test = self._get_previous_predictions(smac_initial_run, model_identifiers[-1], weights, ensemble_size) + + self._logger.debug(f"Finished search for base models, starting fitting next layers") + for stacking_layer in range(1, self.num_stacking_layers): + smac_layer_initial_run = self._backend.get_next_num_run() + time_left_for_higher_stacking_layers = total_walltime_limit -self._stopwatch.wall_elapsed(stacking_task_name) + if time_left_for_higher_stacking_layers < func_eval_time_limit_secs: + break + self._logger.debug(f"Original feat types len: {len(self.dataset.feat_type)}") + nonnull_model_predictions_train = [pred for pred in previous_layer_predictions_train if pred is not None] + nonnull_model_predictions_test = [pred for pred in previous_layer_predictions_test if pred is not None] + assert len(nonnull_model_predictions_train) == len(nonnull_model_predictions_test) + self._logger.debug(f"length Non null predictions: {len(nonnull_model_predictions_train)}") + dataset = get_appended_dataset( + original_dataset=self.dataset, + previous_layer_predictions_train=nonnull_model_predictions_train, + previous_layer_predictions_test=nonnull_model_predictions_test, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + ) + self._logger.debug(f"new feat_types len: {len(dataset.feat_type)}") + updated_model_configs, current_search_space = self._update_configs_for_current_config_space(model_configs, dataset) + self._reset_datamanager_in_backend(datamanager=dataset) + layer_model_identifiers = self._fit_models_on_dataset(updated_model_configs, func_eval_time_limit_secs, stacking_layer, time_left=time_left_for_higher_stacking_layers/(self.num_stacking_layers - 1), current_search_space=current_search_space, smac_initial_run=smac_layer_initial_run) + if any([identifier is not None for identifier in layer_model_identifiers]): + model_identifiers.append( + layer_model_identifiers + ) + _, previous_layer_predictions_train, previous_layer_predictions_test = self._get_previous_predictions(smac_initial_run, model_identifiers[-1], 
weights, ensemble_size) + + ensemble = ensemble.fit(model_identifiers) + self._backend.save_ensemble(ensemble, proc_ensemble.iteration+10, self.seed) + self._load_models() + + def _get_previous_predictions(self, smac_initial_run, model_identifiers, weights, ensemble_size): + model_configs = [] + previous_layer_predictions_train = [] + previous_layer_predictions_test = [] + self._logger.debug(f'id_config: {self.run_history.ids_config}') + for weight, model_identifier in zip(weights, model_identifiers): + if model_identifier is None: + model_configs.append(None) + previous_layer_predictions_train.append(None) + previous_layer_predictions_test.append(None) + continue + seed, num_run, budget = model_identifier + + self._logger.debug(f'num_run: {num_run}') + config = get_config_from_run_history(self.run_history, num_run=num_run) # self.run_history.ids_config.get(num_run-smac_initial_run, None) + self._logger.debug(f'Configuration from previous layer: {config}') + model_configs.append((config, budget)) + previous_layer_predictions_train.extend( + [np.load(os.path.join( + self._backend.get_numrun_directory(seed=seed, num_run=num_run, budget=budget), + self._backend.get_prediction_filename('ensemble', seed, num_run, budget) + ), allow_pickle=True)] * int(weight * ensemble_size)) + previous_layer_predictions_test.extend([np.load(os.path.join( + self._backend.get_numrun_directory(seed=seed, num_run=num_run, budget=budget), + self._backend.get_prediction_filename('test', seed, num_run, budget) + ), allow_pickle=True)] * int(weight * ensemble_size)) + return model_configs,previous_layer_predictions_train,previous_layer_predictions_test + + def _update_configs_for_current_config_space( + self, + model_description: List[Tuple], + dataset: BaseDataset, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + assert_skew_transformer_quantile: bool = False + ) -> List[Tuple]: + + search_space_updates = search_space_updates if search_space_updates is not None else self.search_space_updates + + dataset_properties = self._get_dataset_properties(dataset=dataset) + current_search_space = self._get_search_space( + dataset, + include_components=self.include_components, + exclude_components=self.exclude_components, + search_space_updates=search_space_updates) + self._logger.debug(f"dataset properties after appending predictions: {dict_repr(dataset_properties)}") + n_numerical_in_incumbent_on_task_id = len(self.dataset.numerical_columns) + num_numerical = len(dataset.numerical_columns) + updated_model_descriptions = [] + for config, budget in model_description: + if config is None: + continue + + if not isinstance(config, (Configuration, dict)): + updated_model_descriptions.append((config, budget)) + continue + + updated_config = validate_config( + config=config, + search_space=current_search_space, + num_numerical=num_numerical, + n_numerical_in_incumbent_on_task_id=n_numerical_in_incumbent_on_task_id, + assert_autogluon_numerical_hyperparameters=assert_skew_transformer_quantile + ) + updated_model_descriptions.append((updated_config, budget)) + return updated_model_descriptions, current_search_space + + def _run_smbo( + self, + min_budget, + max_budget, + total_walltime_limit, + func_eval_time_limit_secs, + smac_scenario_args, + portfolio_selection, + experiment_task_name, + proc_ensemble, + num_stacking_layers, + get_smac_object_callback=None, + tae_func=None, + smbo_class=None, + ) -> int: + smac_initial_num_run = self._backend.get_next_num_run(peek=True) + proc_runhistory_updater = None + 
if ( + self.ensemble_method == EnsembleSelectionTypes.stacking_optimisation_ensemble + and smbo_class is not None + ): + proc_runhistory_updater = self._init_result_history_updater(initial_num_run=smac_initial_num_run) + + # ==> Run SMAC + smac_task_name: str = 'runSMAC' + self._stopwatch.start_task(smac_task_name) + elapsed_time = self._stopwatch.wall_elapsed(experiment_task_name) + time_left_for_smac = max(0, total_walltime_limit - elapsed_time) + + self._logger.info("Starting SMAC with %5.2f sec time left" % time_left_for_smac) + if time_left_for_smac <= 0: + self._logger.warning(" Not starting SMAC because there is no time left") + else: + _proc_smac = AutoMLSMBO( + config_space=self.search_space, + dataset_name=str(self.dataset_name), + backend=self._backend, + total_walltime_limit=total_walltime_limit, + func_eval_time_limit_secs=func_eval_time_limit_secs, + dask_client=self._dask_client, + memory_limit=self._memory_limit, + n_jobs=self.n_jobs, + watcher=self._stopwatch, + metric=self._metric, + seed=self.seed, + include=self.include_components, + exclude=self.exclude_components, + disable_file_output=self._disable_file_output, + all_supported_metrics=self._all_supported_metrics, + smac_scenario_args=smac_scenario_args, + get_smac_object_callback=get_smac_object_callback, + pipeline_config=self.pipeline_options, + min_budget=min_budget, + max_budget=max_budget, + ensemble_callback=proc_ensemble, + ensemble_method=self.ensemble_method, + logger_port=self._logger_port, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + # We do not increase the num_run here, this is something + # smac does internally + start_num_run=smac_initial_num_run, + search_space_updates=self.search_space_updates, + portfolio_selection=portfolio_selection, + pynisher_context=self._multiprocessing_context, + smbo_class=smbo_class, + use_ensemble_opt_loss=self.use_ensemble_opt_loss, + other_callbacks=[proc_runhistory_updater] if proc_runhistory_updater is not None else None, + num_stacking_layers=num_stacking_layers + ) + try: + run_history, self._results_manager.trajectory, budget_type = \ + _proc_smac.run_smbo(func=tae_func) + self.run_history.update(run_history, DataOrigin.INTERNAL) + trajectory_filename = os.path.join( + self._backend.get_smac_output_directory_for_run(self.seed), + 'trajectory.json') + + assert self.trajectory is not None # mypy check + saveable_trajectory = \ + [list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:]) + for entry in self.trajectory] + try: + with open(trajectory_filename, 'w') as fh: + json.dump(saveable_trajectory, fh) + except Exception as e: + self._logger.warning(f"Cannot save {trajectory_filename} due to {e}...") + except Exception as e: + self._logger.exception(str(e)) + raise + return smac_initial_num_run + def _search( self, optimize_metric: str, @@ -979,7 +1389,8 @@ def _search( portfolio_selection: Optional[str] = None, dask_client: Optional[dask.distributed.Client] = None, smbo_class: Optional[SMBO] = None, - use_ensemble_opt_loss: bool = False + use_ensemble_opt_loss: bool = False, + posthoc_ensemble_fit_stacking_ensemble_optimization: bool = False ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. 
@@ -1112,6 +1523,184 @@ def _search( self """ + experiment_task_name: str = 'runSearch' + + self._init_required_args( + experiment_task_name=experiment_task_name, + optimize_metric=optimize_metric, + dataset=dataset, + budget_type=budget_type, + max_budget=max_budget, + total_walltime_limit=total_walltime_limit, + memory_limit=memory_limit, + all_supported_metrics=all_supported_metrics, + precision=precision, + disable_file_output=disable_file_output, + dask_client=dask_client + ) + + # Handle time resource allocation + elapsed_time = self._stopwatch.wall_elapsed(experiment_task_name) + time_left_for_modelfit = int(max(0, total_walltime_limit - elapsed_time)) + if func_eval_time_limit_secs is None or func_eval_time_limit_secs > time_left_for_modelfit: + self._logger.warning( + 'Time limit for a single run is higher than total time ' + 'limit. Capping the limit for a single run to the total ' + 'time given to SMAC (%f)' % time_left_for_modelfit + ) + func_eval_time_limit_secs = time_left_for_modelfit + + # Make sure that at least 2 models are created for the ensemble process + num_models = time_left_for_modelfit // func_eval_time_limit_secs + if num_models < 2 and self.ensemble_size > 0: + func_eval_time_limit_secs = time_left_for_modelfit // 2 + self._logger.warning( + "Capping the func_eval_time_limit_secs to {} to have " + "time for a least 2 models to ensemble.".format( + func_eval_time_limit_secs + ) + ) + + self.pipeline_options['func_eval_time_limit_secs'] = func_eval_time_limit_secs + # ============> Run dummy predictions + # We only want to run dummy predictions in case we want to build an ensemble + if self.ensemble_size > 0 and self.ensemble_method != EnsembleSelectionTypes.stacking_optimisation_ensemble: + dummy_task_name = 'runDummy' + self._stopwatch.start_task(dummy_task_name) + self._do_dummy_prediction() + self._stopwatch.stop_task(dummy_task_name) + + # ============> Run traditional ml + # We only want to run traditional predictions in case we want to build an ensemble + # We want time for at least 1 Neural network in SMAC + if enable_traditional_pipeline and self.ensemble_size > 0 and self.ensemble_method != EnsembleSelectionTypes.stacking_optimisation_ensemble: + traditional_runtime_limit = int(self._time_for_task - func_eval_time_limit_secs) + self.run_traditional_ml(current_task_name=self.dataset_name, + runtime_limit=traditional_runtime_limit, + func_eval_time_limit_secs=func_eval_time_limit_secs) + + # ============> Starting ensemble + self.use_ensemble_opt_loss = use_ensemble_opt_loss + if self.ensemble_method == EnsembleSelectionTypes.stacking_repeat_models: + elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) + time_left_for_stacking = max(0, total_walltime_limit - elapsed_time) + self._run_search_stacking( + optimize_metric=optimize_metric, + min_budget=min_budget, + max_budget=max_budget, + smac_scenario_args=smac_scenario_args, + total_walltime_limit=time_left_for_stacking, + func_eval_time_limit_secs=func_eval_time_limit_secs, + budget_type=budget_type, + portfolio_selection=portfolio_selection, + tae_func=tae_func, + precision=precision, + experiment_task_name=experiment_task_name + ) + else: + self.precision = precision + self.opt_metric = optimize_metric + elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) + time_left_for_ensembles = max(0, total_walltime_limit - elapsed_time) + posthoc_ensemble_fit_stacking_ensemble_optimization = posthoc_ensemble_fit_stacking_ensemble_optimization \ + and self.ensemble_method == 
EnsembleSelectionTypes.stacking_optimisation_ensemble + TIME_ALLOCATION_FACTOR_POSTHOC_ENSEMBLE_FIT = 0.95 + time_left_for_ensembles = int(time_left_for_ensembles * TIME_ALLOCATION_FACTOR_POSTHOC_ENSEMBLE_FIT) if posthoc_ensemble_fit_stacking_ensemble_optimization else time_left_for_ensembles + proc_ensemble = None + if time_left_for_ensembles <= 0: + # Fit only raises error when ensemble_size is not zero but + # time_left_for_ensembles is zero. + if self.ensemble_size > 0: + raise ValueError("Not starting ensemble builder because there " + "is no time left. Try increasing the value " + "of time_left_for_this_task.") + elif self.ensemble_size <= 0: + self._logger.info("Not starting ensemble builder as ensemble size is 0") + else: + self._logger.info("Starting ensemble") + proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles, + ensemble_size=self.ensemble_size, + ensemble_nbest=self.ensemble_nbest, + precision=precision, + optimize_metric=self.opt_metric, + ensemble_method=self.ensemble_method, + num_stacking_layers=self.num_stacking_layers + ) + + self._run_smbo( + min_budget=min_budget, + max_budget=max_budget, + total_walltime_limit=total_walltime_limit * TIME_ALLOCATION_FACTOR_POSTHOC_ENSEMBLE_FIT \ + if posthoc_ensemble_fit_stacking_ensemble_optimization \ + else total_walltime_limit, + func_eval_time_limit_secs=func_eval_time_limit_secs, + smac_scenario_args=smac_scenario_args, + get_smac_object_callback=get_smac_object_callback, + tae_func=tae_func, + portfolio_selection=portfolio_selection, + smbo_class=smbo_class, + experiment_task_name=experiment_task_name, + proc_ensemble=proc_ensemble, + num_stacking_layers=self.num_stacking_layers + ) + + if proc_ensemble is not None: + self._collect_results_ensemble(proc_ensemble) + # Wait until the ensemble process is finished to avoid shutting down + # while the ensemble builder tries to access the data + self._logger.info("Starting Shutdown") + + if posthoc_ensemble_fit_stacking_ensemble_optimization: + ensemble = self._backend.load_ensemble(self.seed) + initial_num_run = int(open(os.path.join(self._backend.internals_directory, 'ensemble_cutoff_run.txt'), 'r').read()) + time_for_post_fit_ensemble = max(0, total_walltime_limit-self._stopwatch.wall_elapsed(self.dataset_name)) + iteration = (self.num_stacking_layers+1)*ENSEMBLE_ITERATION_MULTIPLIER + final_model_identifiers, final_weights = self._posthoc_fit_ensemble( + optimize_metric=self.opt_metric, + time_left_for_ensemble=time_for_post_fit_ensemble, + last_successful_smac_initial_num_run=initial_num_run + 1, + ensemble_size=self.ensemble_size, + iteration=iteration, + enable_traditional_pipeline=enable_traditional_pipeline, + cleanup=False, + func_eval_time_limit_secs=0.5*func_eval_time_limit_secs + ) + ensemble.identifiers_ = final_model_identifiers + stacked_ensemble_identifiers = ensemble.stacked_ensemble_identifiers + broken = False + for i, layer_identifiers in enumerate(stacked_ensemble_identifiers): + if all([identifier is None for identifier in layer_identifiers]): + broken = True + break + last_nonnull_layer = i-1 if broken else i + self._logger.debug(f"broken: {broken}, lastnonnull layer: {last_nonnull_layer}, i: {i}") + ensemble.stacked_ensemble_identifiers[last_nonnull_layer] = final_model_identifiers + ensemble.weights_ = final_weights + self._backend.save_ensemble(ensemble, iteration+1, self.seed) + + if load_models: + self._logger.info("Loading models...") + self._load_models() + self._logger.info("Finished loading models...") + + 
self._cleanup() + + return self + + def _init_required_args( + self, + experiment_task_name: str, + optimize_metric: str, + dataset: BaseDataset, + budget_type: str, + max_budget: int, + total_walltime_limit: int, + memory_limit: int, + all_supported_metrics: bool, + precision: int, + dask_client: Optional[dask.distributed.Client] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None + ) -> None: if self.task_type != dataset.task_type: raise ValueError("Incompatible dataset entered for current task," "expected dataset to have task type :{} but got " @@ -1120,14 +1709,7 @@ def _search( raise ValueError("precision must be one of 16, 32, 64 but got {}".format(precision)) # Initialise information needed for the experiment - experiment_task_name: str = 'runSearch' - dataset_requirements = get_dataset_requirements( - info=dataset.get_required_dataset_info(), - include=self.include_components, - exclude=self.exclude_components, - search_space_updates=self.search_space_updates) - self._dataset_requirements = dataset_requirements - dataset_properties = dataset.get_dataset_properties(dataset_requirements) + dataset_properties = self._get_dataset_properties(dataset) self._stopwatch.start_task(experiment_task_name) self.dataset_name = dataset.dataset_name assert self.dataset_name is not None @@ -1195,161 +1777,16 @@ def _search( else: self._dask_client = dask_client self._is_dask_client_internally_created = False + return - # Handle time resource allocation - elapsed_time = self._stopwatch.wall_elapsed(experiment_task_name) - time_left_for_modelfit = int(max(0, total_walltime_limit - elapsed_time)) - if func_eval_time_limit_secs is None or func_eval_time_limit_secs > time_left_for_modelfit: - self._logger.warning( - 'Time limit for a single run is higher than total time ' - 'limit. 
Capping the limit for a single run to the total ' - 'time given to SMAC (%f)' % time_left_for_modelfit - ) - func_eval_time_limit_secs = time_left_for_modelfit - - # Make sure that at least 2 models are created for the ensemble process - num_models = time_left_for_modelfit // func_eval_time_limit_secs - if num_models < 2 and self.ensemble_size > 0: - func_eval_time_limit_secs = time_left_for_modelfit // 2 - self._logger.warning( - "Capping the func_eval_time_limit_secs to {} to have " - "time for a least 2 models to ensemble.".format( - func_eval_time_limit_secs - ) - ) - - # ============> Run dummy predictions - # We only want to run dummy predictions in case we want to build an ensemble - if self.ensemble_size > 0 and self.ensemble_method != EnsembleSelectionTypes.stacking_ensemble: - dummy_task_name = 'runDummy' - self._stopwatch.start_task(dummy_task_name) - self._do_dummy_prediction() - self._stopwatch.stop_task(dummy_task_name) - - # ============> Run traditional ml - # We only want to run traditional predictions in case we want to build an ensemble - # We want time for at least 1 Neural network in SMAC - if enable_traditional_pipeline and self.ensemble_size > 0: - traditional_runtime_limit = int(self._time_for_task - func_eval_time_limit_secs) - self.run_traditional_ml(current_task_name=self.dataset_name, - runtime_limit=traditional_runtime_limit, - func_eval_time_limit_secs=func_eval_time_limit_secs) - - # ============> Starting ensemble - self.use_ensemble_opt_loss = use_ensemble_opt_loss - self.precision = precision - self.opt_metric = optimize_metric - elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) - time_left_for_ensembles = max(0, total_walltime_limit - elapsed_time) - proc_ensemble = None - if time_left_for_ensembles <= 0: - # Fit only raises error when ensemble_size is not zero but - # time_left_for_ensembles is zero. - if self.ensemble_size > 0: - raise ValueError("Not starting ensemble builder because there " - "is no time left. 
Try increasing the value " - "of time_left_for_this_task.") - elif self.ensemble_size <= 0: - self._logger.info("Not starting ensemble builder as ensemble size is 0") - else: - self._logger.info("Starting ensemble") - proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles, - ensemble_size=self.ensemble_size, - ensemble_nbest=self.ensemble_nbest, - precision=precision, - optimize_metric=self.opt_metric, - ensemble_method=self.ensemble_method - ) - - smac_initial_num_run = self._backend.get_next_num_run(peek=True) - - proc_runhistory_updater = None - if ( - self.ensemble_method == EnsembleSelectionTypes.stacking_ensemble - and smbo_class is not None - ): - proc_runhistory_updater = self._init_result_history_updater(initial_num_run=smac_initial_num_run) - - # ==> Run SMAC - smac_task_name: str = 'runSMAC' - self._stopwatch.start_task(smac_task_name) - elapsed_time = self._stopwatch.wall_elapsed(experiment_task_name) - time_left_for_smac = max(0, total_walltime_limit - elapsed_time) - - self._logger.info("Starting SMAC with %5.2f sec time left" % time_left_for_smac) - if time_left_for_smac <= 0: - self._logger.warning(" Not starting SMAC because there is no time left") - else: - - _proc_smac = AutoMLSMBO( - config_space=self.search_space, - dataset_name=str(dataset.dataset_name), - backend=self._backend, - total_walltime_limit=total_walltime_limit, - func_eval_time_limit_secs=func_eval_time_limit_secs, - dask_client=self._dask_client, - memory_limit=self._memory_limit, - n_jobs=self.n_jobs, - watcher=self._stopwatch, - metric=self._metric, - seed=self.seed, - include=self.include_components, - exclude=self.exclude_components, - disable_file_output=self._disable_file_output, - all_supported_metrics=self._all_supported_metrics, - smac_scenario_args=smac_scenario_args, - get_smac_object_callback=get_smac_object_callback, - pipeline_config=self.pipeline_options, - min_budget=min_budget, - max_budget=max_budget, - ensemble_callback=proc_ensemble, - ensemble_method=self.ensemble_method, - logger_port=self._logger_port, - # We do not increase the num_run here, this is something - # smac does internally - start_num_run=smac_initial_num_run, - search_space_updates=self.search_space_updates, - portfolio_selection=portfolio_selection, - pynisher_context=self._multiprocessing_context, - smbo_class = smbo_class, - use_ensemble_opt_loss=self.use_ensemble_opt_loss, - other_callbacks=[proc_runhistory_updater] if proc_runhistory_updater is not None else None - ) - try: - run_history, self._results_manager.trajectory, budget_type = \ - _proc_smac.run_smbo(func=tae_func) - self.run_history.update(run_history, DataOrigin.INTERNAL) - trajectory_filename = os.path.join( - self._backend.get_smac_output_directory_for_run(self.seed), - 'trajectory.json') - - assert self.trajectory is not None # mypy check - saveable_trajectory = \ - [list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:]) - for entry in self.trajectory] - try: - with open(trajectory_filename, 'w') as fh: - json.dump(saveable_trajectory, fh) - except Exception as e: - self._logger.warning(f"Cannot save {trajectory_filename} due to {e}...") - except Exception as e: - self._logger.exception(str(e)) - raise - # Wait until the ensemble process is finished to avoid shutting down - # while the ensemble builder tries to access the data - self._logger.info("Starting Shutdown") - - if proc_ensemble is not None: - self._collect_results_ensemble(proc_ensemble) - - if load_models: - self._logger.info("Loading 
models...") - self._load_models() - self._logger.info("Finished loading models...") - - self._cleanup() - - return self + def _get_dataset_properties(self, dataset): + dataset_requirements = get_dataset_requirements( + info=dataset.get_required_dataset_info(), + include=self.include_components, + exclude=self.exclude_components, + search_space_updates=self.search_space_updates) + dataset_properties = dataset.get_dataset_properties(dataset_requirements) + return dataset_properties def _get_fit_dictionary( self, @@ -1451,7 +1888,7 @@ def fit_pipeline( X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, dataset_name: Optional[str] = None, - resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, run_time_limit_secs: int = 60, memory_limit: Optional[int] = None, @@ -1631,7 +2068,7 @@ def fit_pipeline( pipeline_options = self.pipeline_options.copy().update(pipeline_options) if pipeline_options is not None \ else self.pipeline_options.copy() - + pipeline_options['func_eval_time_limit_secs'] = run_time_limit_secs assert pipeline_options is not None if budget_type is not None: @@ -1727,10 +2164,14 @@ def fit_ensemble( ensemble_nbest: int = 50, ensemble_size: int = 50, ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, + num_stacking_layers: int = 1, + initial_num_run: int = 0, load_models: bool = True, time_for_task: int = 100, func_eval_time_limit_secs: int = 50, enable_traditional_pipeline: bool = True, + iteration: int = 0, + cleanup: bool = True ) -> 'BaseTask': """ Enables post-hoc fitting of the ensemble after the `search()` @@ -1778,7 +2219,7 @@ def fit_ensemble( self """ # Make sure that input is valid - if self.dataset is None or self.opt_metric is None: + if self.dataset is None: raise ValueError("fit_ensemble() can only be called after `search()`. " "Please call the `search()` method of {} prior to " "fit_ensemble().".format(self.__class__.__name__)) @@ -1837,6 +2278,10 @@ def fit_ensemble( precision=precision, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, + ensemble_method=ensemble_method, + num_stacking_layers=num_stacking_layers, + initial_num_run=initial_num_run, + iteration=iteration ) manager.build_ensemble(self._dask_client) @@ -1848,7 +2293,8 @@ def fit_ensemble( self._stopwatch.stop_task(ensemble_fit_task_name) - self._cleanup() + if cleanup: + self._cleanup() return self @@ -1859,7 +2305,10 @@ def _init_ensemble_builder( ensemble_method: int, ensemble_nbest: int, ensemble_size: int, + num_stacking_layers: Optional[int] = None, precision: int = 32, + initial_num_run: int = 0, + iteration: int = 0, ) -> EnsembleBuilderManager: """ Initializes an `EnsembleBuilderManager`. 
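A minimal post-hoc usage sketch for the extended fit_ensemble() signature above; `api` stands for an already-searched task instance, and every argument value here is illustrative rather than taken from a real run:

    api.fit_ensemble(
        optimize_metric='accuracy',
        ensemble_size=5,
        ensemble_nbest=10,
        num_stacking_layers=1,        # new: number of stacking layers the builder should produce
        initial_num_run=0,            # new: first num_run whose predictions are considered
        iteration=0,                  # new: iteration tag under which the ensemble is saved
        cleanup=False,                # new: keep the temporary directory for a later predict()
        time_for_task=100,
        func_eval_time_limit_secs=50,
    )
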
@@ -1919,7 +2368,10 @@ def _init_ensemble_builder( random_state=self.seed, precision=precision, logger_port=self._logger_port, - use_ensemble_loss=self.use_ensemble_opt_loss + use_ensemble_loss=self.use_ensemble_opt_loss, + num_stacking_layers=num_stacking_layers, + initial_num_run=initial_num_run, + iteration=iteration ) self._stopwatch.stop_task(ensemble_task_name) @@ -2000,19 +2452,51 @@ def predict( # Mypy assert assert self.ensemble_ is not None, "Load models should error out if no ensemble" + predictions = self._predict_with_ensemble(X_test=X_test, batch_size=batch_size, n_jobs=n_jobs) + + self._cleanup() + + return predictions + + def _predict_with_ensemble(self, X_test, batch_size, n_jobs) -> np.ndarray: + + assert self.ensemble_ is not None, "Load models should error out if no ensemble" if isinstance(self.resampling_strategy, (HoldoutValTypes, NoResamplingStrategyTypes)): models = self.models_ - elif isinstance(self.resampling_strategy, CrossValTypes): + elif isinstance(self.resampling_strategy, (CrossValTypes, RepeatedCrossValTypes)): models = self.cv_models_ - all_predictions = joblib.Parallel(n_jobs=n_jobs)( - joblib.delayed(_pipeline_predict)( - models[identifier], X_test, batch_size, self._logger, STRING_TO_TASK_TYPES[self.task_type] - ) - for identifier in self.ensemble_.get_selected_model_identifiers() if identifier is not None - ) + X_test_copy = X_test.copy() + if self.ensemble_method.is_stacking_ensemble(): + ensemble_identifiers = self.ensemble_.get_selected_model_identifiers() + self._logger.debug(f"ensemble identifiers: {ensemble_identifiers}") + for i, (model, layer_identifiers) in enumerate(zip(models, ensemble_identifiers)): + if all([identifier is None for identifier in layer_identifiers]): + break + self._logger.debug(f"layer : {i} of stacking ensemble,\n layer identifiers: {layer_identifiers},\n model: {model}") + all_predictions = joblib.Parallel(n_jobs=n_jobs)( + joblib.delayed(_pipeline_predict)( + model[identifier], X_test_copy, batch_size, self._logger, STRING_TO_TASK_TYPES[self.task_type] + ) + for identifier in layer_identifiers if identifier is not None + ) + if self.ensemble_method in (EnsembleSelectionTypes.stacking_ensemble_selection_per_layer, EnsembleSelectionTypes.stacking_repeat_models, EnsembleSelectionTypes.stacking_autogluon): + concat_all_predictions = self.ensemble_.get_expanded_layer_stacking_ensemble_predictions( + stacking_layer=i, raw_stacking_layer_ensemble_predictions=all_predictions) + else: + concat_all_predictions = all_predictions + + X_test_copy = np.concatenate([X_test, *concat_all_predictions], axis=1) + else: + all_predictions = joblib.Parallel(n_jobs=n_jobs)( + joblib.delayed(_pipeline_predict)( + models[identifier], X_test_copy, batch_size, self._logger, STRING_TO_TASK_TYPES[self.task_type] + ) + for identifier in self.ensemble_.get_selected_model_identifiers() + ) + if len(all_predictions) == 0: raise ValueError('Something went wrong generating the predictions. ' 'The ensemble should consist of the following ' @@ -2118,13 +2602,23 @@ def show_models(self) -> str: str: Markdown table of models. 
""" - df = [] - for weight, model in self.get_models_with_weights(): - representation = model.get_pipeline_representation() - representation.update({'Weight': weight}) - df.append(representation) - models_markdown: str = pd.DataFrame(df).to_markdown() - return models_markdown + if self.ensemble_method.is_stacking_ensemble(): + df = [] + for layer, model_weight in enumerate(self.get_models_with_weights()): + for weight, model in model_weight: + representation = model.get_pipeline_representation() + representation.update({'Weight': weight, "Stacking Layer": layer}) + df.append(representation) + models_markdown: str = pd.DataFrame(df).to_markdown() + return models_markdown + else: + df = [] + for weight, model in self.get_models_with_weights(): + representation = model.get_pipeline_representation() + representation.update({'Weight': weight}) + df.append(representation) + models_markdown: str = pd.DataFrame(df).to_markdown() + return models_markdown def _print_debug_info_to_log(self) -> None: """ diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 3e6354c03..f1ac64d58 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -1,5 +1,7 @@ from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union +import dask.distributed + import numpy as np import pandas as pd @@ -91,6 +93,7 @@ def __init__( ensemble_size: int = 50, ensemble_nbest: int = 50, ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, + num_stacking_layers: int = 1, max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, @@ -100,6 +103,7 @@ def __init__( exclude_components: Optional[Dict[str, Any]] = None, resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, + feat_type: Optional[List[str]] = None, backend: Optional[Backend] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ): @@ -121,8 +125,10 @@ def __init__( backend=backend, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, + feat_type=feat_type, search_space_updates=search_space_updates, task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION], + num_stacking_layers=num_stacking_layers ) def build_pipeline( @@ -169,6 +175,7 @@ def _get_dataset_input_validator( y_train: Union[List, pd.DataFrame, np.ndarray], X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + feat_type: Optional[List] = None, resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, @@ -210,13 +217,14 @@ def _get_dataset_input_validator( resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ self.resampling_strategy_args - + feat_type = feat_type if feat_type is not None else self.feat_type # Create a validator object to make sure that the data provided by # the user matches the autopytorch requirements input_validator = TabularInputValidator( is_classification=True, logger_port=self._logger_port, - dataset_compression=dataset_compression + dataset_compression=dataset_compression, + feat_type=feat_type ) # Fit a input validator to check the provided data 
@@ -235,6 +243,51 @@ def _get_dataset_input_validator( return dataset, input_validator + def run_autogluon_stacking( + self, + optimize_metric: str, + X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + dataset_name: Optional[str] = None, + max_budget: int = 50, + budget_type: str = 'epochs', + total_walltime_limit: int = 100, + func_eval_time_limit_secs: Optional[int] = None, + memory_limit: Optional[int] = 4096, + dataset_compression: Union[Mapping[str, Any], bool] = False, + all_supported_metrics: bool = True, + precision: int = 32, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + dask_client: Optional[dask.distributed.Client] = None + ): + self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression) + + self.dataset, self.input_validator = self._get_dataset_input_validator( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + dataset_name=dataset_name, + dataset_compression=self._dataset_compression) + + return self._run_autogluon_stacking( + optimize_metric=optimize_metric, + dataset=self.dataset, + max_budget=max_budget, + budget_type=budget_type, + total_walltime_limit=total_walltime_limit, + func_eval_time_limit_secs=func_eval_time_limit_secs, + memory_limit=memory_limit, + all_supported_metrics=all_supported_metrics, + precision=precision, + disable_file_output=disable_file_output, + dask_client=dask_client, + ) + def search( self, optimize_metric: str, @@ -259,7 +312,8 @@ def search( portfolio_selection: Optional[str] = None, dataset_compression: Union[Mapping[str, Any], bool] = False, smbo_class: Optional[SMBO] = None, - use_ensemble_opt_loss=False + use_ensemble_opt_loss=False, + posthoc_ensemble_fit_stacking_ensemble_optimization: bool = False ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. 
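A minimal call sketch for the run_autogluon_stacking() entry point added above, reusing the `api` instance from the previous sketch; the data and time limits are illustrative only:

    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=10, random_state=1)
    api.run_autogluon_stacking(
        optimize_metric='accuracy',
        X_train=X[:150], y_train=y[:150],
        X_test=X[150:], y_test=y[150:],
        total_walltime_limit=300,          # overall budget in seconds
        func_eval_time_limit_secs=50,      # per-pipeline budget in seconds
    )
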
@@ -460,7 +514,8 @@ def search( load_models=load_models, portfolio_selection=portfolio_selection, smbo_class=smbo_class, - use_ensemble_opt_loss=use_ensemble_opt_loss + use_ensemble_opt_loss=use_ensemble_opt_loss, + posthoc_ensemble_fit_stacking_ensemble_optimization=posthoc_ensemble_fit_stacking_ensemble_optimization ) def predict( @@ -504,3 +559,4 @@ def predict_proba(self, "the estimator search() method.") X_test = self.input_validator.feature_validator.transform(X_test) return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) + diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index c9f21e453..b932cda2e 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -89,6 +89,7 @@ def __init__( ensemble_size: int = 50, ensemble_nbest: int = 50, ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, + num_stacking_layers: int = 1, max_models_on_disc: int = 50, temporary_directory: Optional[str] = None, output_directory: Optional[str] = None, @@ -98,6 +99,7 @@ def __init__( exclude_components: Optional[Dict[str, Any]] = None, resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, + feat_type: Optional[List[str]] = None, backend: Optional[Backend] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ): @@ -109,6 +111,7 @@ def __init__( ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, ensemble_method=ensemble_method, + num_stacking_layers=num_stacking_layers, max_models_on_disc=max_models_on_disc, temporary_directory=temporary_directory, output_directory=output_directory, @@ -119,6 +122,7 @@ def __init__( backend=backend, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, + feat_type=feat_type, search_space_updates=search_space_updates, task_type=TASK_TYPES_TO_STRING[TABULAR_REGRESSION], ) @@ -167,6 +171,7 @@ def _get_dataset_input_validator( y_train: Union[List, pd.DataFrame, np.ndarray], X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + feat_type: Optional[List[str]] = None, resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, @@ -207,13 +212,14 @@ def _get_dataset_input_validator( resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ self.resampling_strategy_args - + feat_type = feat_type if feat_type is not None else self.feat_type # Create a validator object to make sure that the data provided by # the user matches the autopytorch requirements input_validator = TabularInputValidator( is_classification=False, logger_port=self._logger_port, - dataset_compression=dataset_compression + dataset_compression=dataset_compression, + feat_type=feat_type ) # Fit a input validator to check the provided data diff --git a/autoPyTorch/api/utils.py b/autoPyTorch/api/utils.py new file mode 100644 index 000000000..4559854f8 --- /dev/null +++ b/autoPyTorch/api/utils.py @@ -0,0 +1,139 @@ +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates +from smac.runhistory.runhistory import RunHistory + +def get_autogluon_default_nn_config(feat_type): + has_numerical_features = 
"numerical" in feat_type + has_cat_features = "categorical" in feat_type + search_space_updates = HyperparameterSearchSpaceUpdates() + + + # architecture head + search_space_updates.append( + node_name='network_head', + hyperparameter='__choice__', + value_range=['no_head'], + default_value='no_head', + ) + search_space_updates.append( + node_name='network_head', + hyperparameter='no_head:activation', + value_range=['relu', 'elu'], + default_value='relu', + ) + + # backbone architecture + search_space_updates.append( + node_name='network_backbone', + hyperparameter='__choice__', + value_range=['MLPBackbone'], + default_value='MLPBackbone', + ) + search_space_updates.append( + node_name='network_backbone', + hyperparameter='MLPBackbone:num_groups', + value_range=(2, 4), + default_value=4, + ) + search_space_updates.append( + node_name='network_backbone', + hyperparameter='MLPBackbone:num_units', + value_range=[128, 512], + default_value=128, + log=True + ) + search_space_updates.append( + node_name='network_backbone', + hyperparameter='MLPBackbone:dropout', + value_range=(0.1, 0.5), + default_value=0.1, + ) + search_space_updates.append( + node_name='network_backbone', + hyperparameter='MLPBackbone:activation', + value_range=['relu', 'elu'], + default_value='relu', + ) + + # training updates + search_space_updates.append( + node_name='lr_scheduler', + hyperparameter='__choice__', + value_range=['NoScheduler'], + default_value='NoScheduler', + ) + search_space_updates.append( + node_name='optimizer', + hyperparameter='__choice__', + value_range=['AdamOptimizer', 'SGDOptimizer'], + default_value='AdamOptimizer', + ) + search_space_updates.append( + node_name='optimizer', + hyperparameter='AdamOptimizer:lr', + value_range=[1e-4, 3e-2], + default_value=3e-4, + ) + search_space_updates.append( + node_name='optimizer', + hyperparameter='AdamOptimizer:weight_decay', + value_range=(1E-12, 0.1), + default_value=1e-6, + ) + search_space_updates.append( + node_name='data_loader', + hyperparameter='max_batch_size', + value_range=[512], + default_value=512, + ) + + # preprocessing + search_space_updates.append( + node_name='feature_preprocessor', + hyperparameter='__choice__', + value_range=['NoFeaturePreprocessor'], + default_value='NoFeaturePreprocessor', + ) + + if has_numerical_features: + search_space_updates.append( + node_name='imputer', + hyperparameter='numerical_strategy', + value_range=['median', 'mean', 'most_frequent'], + default_value='median', + ) + search_space_updates.append( + node_name='scaler', + hyperparameter='__choice__', + value_range=['StandardScaler'], + default_value='StandardScaler', + ) + # preprocessing + search_space_updates.append( + node_name='skew_transformer', + hyperparameter='__choice__', + value_range=['QuantileTransformer'], + default_value='QuantileTransformer', + ) + + if has_cat_features: + search_space_updates.append( + node_name='encoder', + hyperparameter='__choice__', + value_range=['OneHotEncoder', 'NoEncoder'], + default_value='OneHotEncoder', + ) + search_space_updates.append( + node_name="network_embedding", + hyperparameter="__choice__", + value_range=('NoEmbedding', 'LearnedEntityEmbedding'), + default_value='LearnedEntityEmbedding' + ) + + return search_space_updates + + +def get_config_from_run_history(run_history: RunHistory, num_run: int): + for _, run_value in run_history.data.items(): + if run_value.additional_info.get('num_run', -1) == num_run: # to ensure that unsuccessful configs are not returned + return 
run_value.additional_info['configuration'] + \ No newline at end of file diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index c2d3b1c91..3b0a45d01 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -46,7 +46,7 @@ def __init__( # Required for dataset properties self.num_features: Optional[int] = None - self.categories: List[List[int]] = [] + self.num_categories_per_col: List[List[int]] = [] self.categorical_columns: List[int] = [] self.numerical_columns: List[int] = [] diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index af7932557..c17bd4416 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -98,8 +98,10 @@ class TabularFeatureValidator(BaseFeatureValidator): def __init__( self, logger: Optional[Union[PicklableClientLogger, Logger]] = None, + feat_type: Optional[List[str]] = None ): super().__init__(logger) + self.feat_type = feat_type @staticmethod def _comparator(cmp1: str, cmp2: str) -> int: @@ -168,7 +170,10 @@ def _fit( self.dtypes = [dt.name for dt in X.dtypes] # Also note this change in self.dtypes self.all_nan_columns = set(all_nan_columns) - self.enc_columns, self.feat_type = self._get_columns_info(X) + if self.feat_type is not None: + self.enc_columns = [X.columns[i] for i, col in enumerate(self.feat_type) if col.lower() == 'categorical'] + else: + self.enc_columns, self.feat_type = self._get_columns_info(X) if len(self.enc_columns) > 0: @@ -193,10 +198,7 @@ def _fit( encoded_categories = self.column_transformer.\ named_transformers_['categorical_pipeline'].\ named_steps['ordinalencoder'].categories_ - self.categories = [ - list(range(len(cat))) - for cat in encoded_categories - ] + self.num_categories_per_col = [len(cat) for cat in encoded_categories] # differently to categorical_columns and numerical_columns, # this saves the index of the column. 
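A small sketch of the representational change from `categories` to `num_categories_per_col` above; the example category arrays are illustrative:

    import numpy as np

    encoded_categories = [np.array(['a', 'b', 'c']), np.array(['x', 'y'])]

    # old attribute: one list of category indices per categorical column -> [[0, 1, 2], [0, 1]]
    categories = [list(range(len(cat))) for cat in encoded_categories]

    # new attribute: just the cardinality per categorical column -> [3, 2]
    num_categories_per_col = [len(cat) for cat in encoded_categories]
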
diff --git a/autoPyTorch/data/tabular_validator.py b/autoPyTorch/data/tabular_validator.py index 492327fbe..347708d92 100644 --- a/autoPyTorch/data/tabular_validator.py +++ b/autoPyTorch/data/tabular_validator.py @@ -1,6 +1,6 @@ # -*- encoding: utf-8 -*- import logging -from typing import Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import numpy as np @@ -48,12 +48,14 @@ def __init__( logger_port: Optional[int] = None, dataset_compression: Optional[DatasetCompressionSpec] = None, seed: int = 42, + feat_type: Optional[List[str]] = None ): self.dataset_compression = dataset_compression self._reduced_dtype: Optional[DatasetDTypeContainerType] = None self.is_classification = is_classification self.logger_port = logger_port self.seed = seed + self.feat_type = feat_type if self.logger_port is not None: self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger( name='Validation', @@ -63,7 +65,8 @@ def __init__( self.logger = logging.getLogger('Validation') self.feature_validator = TabularFeatureValidator( - logger=self.logger) + logger=self.logger, + feat_type=self.feat_type) self.target_validator = TabularTargetValidator( is_classification=self.is_classification, logger=self.logger diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index baea81680..36729c807 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -25,7 +25,10 @@ NoResamplingFunc, NoResamplingFuncs, NoResamplingStrategyTypes, - ResamplingStrategies + ResamplingStrategies, + RepeatedCrossValFunc, + RepeatedCrossValFuncs, + RepeatedCrossValTypes ) from autoPyTorch.utils.common import FitRequirement, ispandas @@ -154,6 +157,7 @@ def __init__( self.cross_validators: Dict[str, CrossValFunc] = {} self.holdout_validators: Dict[str, HoldOutFunc] = {} self.no_resampling_validators: Dict[str, NoResamplingFunc] = {} + self.repeated_cross_validators: Dict[str, RepeatedCrossValFunc] = {} self.random_state = np.random.RandomState(seed=seed) self.shuffle = shuffle self.resampling_strategy = resampling_strategy @@ -167,7 +171,7 @@ def __init__( # Make sure cross validation splits are created once self.cross_validators = CrossValFuncs.get_cross_validators(*CrossValTypes) self.holdout_validators = HoldOutFuncs.get_holdout_validators(*HoldoutValTypes) - + self.repeated_cross_validators = RepeatedCrossValFuncs.get_repeated_cross_validators(*RepeatedCrossValTypes) self.no_resampling_validators = NoResamplingFuncs.get_no_resampling_validators(*NoResamplingStrategyTypes) self.splits = self.get_splits_from_resampling_strategy() @@ -237,12 +241,12 @@ def __len__(self) -> int: def _get_indices(self) -> np.ndarray: return self.random_state.permutation(len(self)) if self.shuffle else np.arange(len(self)) - def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], Optional[List[int]]]]: + def get_splits_from_resampling_strategy(self) -> List[List[Tuple[List[int], Optional[List[int]]]]]: """ Creates a set of splits based on a resampling strategy provided Returns - (List[Tuple[List[int], List[int]]]): splits in the [train_indices, val_indices] format + (List[List[Tuple[List[int], Optional[List[int]]]]]): splits in the [train_indices, val_indices] format """ splits = [] if isinstance(self.resampling_strategy, HoldoutValTypes): @@ -251,10 +255,12 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], Optional[ if self.resampling_strategy_args is not None: val_share = 
self.resampling_strategy_args.get('val_share', val_share) splits.append( - self.create_holdout_val_split( - holdout_val_type=self.resampling_strategy, - val_share=val_share, - ) + [ + self.create_holdout_val_split( + holdout_val_type=self.resampling_strategy, + val_share=val_share, + ) + ] ) elif isinstance(self.resampling_strategy, CrossValTypes): num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( @@ -262,15 +268,32 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], Optional[ if self.resampling_strategy_args is not None: num_splits = self.resampling_strategy_args.get('num_splits', num_splits) # Create the split if it was not created before - splits.extend( + splits.append( self.create_cross_val_splits( cross_val_type=self.resampling_strategy, num_splits=cast(int, num_splits), ) ) + elif isinstance(self.resampling_strategy, RepeatedCrossValTypes): + num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( + 'num_splits', None) + num_repeats = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( + 'num_repeats', None + ) + if self.resampling_strategy_args is not None: + num_splits = self.resampling_strategy_args.get('num_splits', num_splits) + num_repeats = self.resampling_strategy_args.get('num_repeats', num_splits) + # Create the split if it was not created before + splits.extend( + self.create_repeated_cross_val_splits( + repeated_cross_val_type=self.resampling_strategy, + num_splits=cast(int, num_splits), + num_repeats=cast(int, num_repeats) + ) + ) elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): - splits.append((self.no_resampling_validators[self.resampling_strategy.name](self.random_state, - self._get_indices()), None)) + splits.append([(self.no_resampling_validators[self.resampling_strategy.name](self.random_state, + self._get_indices()), None)]) else: raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") return splits @@ -307,6 +330,38 @@ def create_cross_val_splits( self.random_state, num_splits, self._get_indices(), **kwargs) return splits + def create_repeated_cross_val_splits( + self, + repeated_cross_val_type: RepeatedCrossValTypes, + num_splits: int, + num_repeats: int + ) -> List[List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]]: + """ + This function creates the cross validation split for the given task. + It is done once per dataset to have comparable results among pipelines + Args: + repeated_cross_val_type (RepeatedCrossValTypes): + num_splits (int): number of splits to be created + num_repeats (int): number of repeats of splits to be created + Returns: + (List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]): + list containing 'num_splits' splits. + """ + # Create just the split once + # This is gonna be called multiple times, because the current dataset + # is being used for multiple pipelines. That is, to be efficient with memory + # we dump the dataset to memory and read it on a need basis. 
So this function + # should be robust against multiple calls, and it does so by remembering the splits + if not isinstance(repeated_cross_val_type, RepeatedCrossValTypes): + raise NotImplementedError(f'The selected `repeated_cross_val_type` "{repeated_cross_val_type}" is not implemented.') + kwargs = {} + if repeated_cross_val_type.is_stratified(): + # we need additional information about the data for stratification + kwargs["stratify"] = self.train_tensors[-1] + splits = self.repeated_cross_validators[repeated_cross_val_type.name]( + random_state=self.random_state, num_splits=num_splits, num_repeats=num_repeats, indices=self._get_indices(), **kwargs) + return splits + def create_holdout_val_split( self, holdout_val_type: HoldoutValTypes, @@ -342,7 +397,7 @@ def create_holdout_val_split( self.random_state, val_share, self._get_indices(), **kwargs) return train, val - def get_dataset(self, split_id: int, train: bool) -> Dataset: + def get_dataset(self, split_id: int, train: bool, repeat_id: int = 0) -> Dataset: """ The above split methods employ the Subset to internally subsample the whole dataset. @@ -358,10 +413,14 @@ def get_dataset(self, split_id: int, train: bool) -> Dataset: Dataset: the reduced dataset to be used for testing """ # Subset creates a dataset. Splits is a (train_indices, test_indices) tuple - if split_id >= len(self.splits): # old version: split_id > len(self.splits) - raise IndexError(f"self.splits index out of range, got split_id={split_id}" - f" (>= num_splits={len(self.splits)})") - indices = self.splits[split_id][int(not train)] # 0: for training, 1: for evaluation + if repeat_id >= len(self.splits): + raise IndexError("repeat_id out of range, got repeat_id={}" + " (>= num_repeats={})".format(split_id, len(self.splits))) + if split_id >= len(self.splits[repeat_id]): + raise IndexError("split_id out of range, got split_id={}" + " (>= num_splits={})".format(split_id, len(self.splits[repeat_id]))) + subset = int(not train) + indices = self.splits[repeat_id][split_id][subset] if indices is None: raise ValueError("Specified fold (or subset) does not exist") diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 78447a04e..12750d8a4 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -9,7 +9,9 @@ StratifiedKFold, StratifiedShuffleSplit, TimeSeriesSplit, - train_test_split + train_test_split, + RepeatedKFold, + RepeatedStratifiedKFold ) from typing_extensions import Protocol @@ -39,6 +41,16 @@ def __call__(self, random_state: np.random.RandomState, val_share: float, ... +class RepeatedCrossValFunc(Protocol): + def __call__(self, + random_state: np.random.RandomState, + num_splits: int, + num_repeats: int, + indices: np.ndarray, + stratify: Optional[Any]) -> List[List[Tuple[np.ndarray, np.ndarray]]]: + ... + + class CrossValTypes(IntEnum): """The type of cross validation @@ -90,8 +102,29 @@ def is_stratified(self) -> bool: return False +class RepeatedCrossValTypes(IntEnum): + """The type of repeated cross validation + This class is used to specify the cross validation function + and is not supposed to be instantiated. 
+ Examples: This class is supposed to be used as follows + >>> cv_type = RepeatedCrossValTypes.repeated_k_fold_cross_validation + >>> print(cv_type.name) + repeated_k_fold_cross_validation + >>> for cross_val_type in CrossValTypes: + print(cross_val_type.name, cross_val_type.value) + stratified_repeated_k_fold_cross_validation 1 + repeated_k_fold_cross_validation 2 + """ + stratified_repeated_k_fold_cross_validation = 1 + repeated_k_fold_cross_validation = 2 + + def is_stratified(self) -> bool: + stratified = [self.stratified_repeated_k_fold_cross_validation] + return getattr(self, self.name) in stratified + + # TODO: replace it with another way -ResamplingStrategies = Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes] +ResamplingStrategies = Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes, RepeatedCrossValTypes] DEFAULT_RESAMPLING_PARAMETERS: Dict[ ResamplingStrategies, @@ -115,7 +148,11 @@ def is_stratified(self) -> bool: CrossValTypes.time_series_cross_validation: { 'num_splits': 5, }, - NoResamplingStrategyTypes.no_resampling: {} + NoResamplingStrategyTypes.no_resampling: {}, + RepeatedCrossValTypes.repeated_k_fold_cross_validation: { + 'num_splits': 2, + 'num_repeats': 2 + }, } @@ -270,3 +307,51 @@ def no_resampling(random_state: np.random.RandomState, np.ndarray: array of indices """ return indices + + +# TODO: Add resampling strategy for stacking, depends on the choice of implementation +class RepeatedCrossValFuncs: + @staticmethod + def repeated_k_fold_cross_validation(random_state: np.random.RandomState, + num_splits: int, + num_repeats: int, + indices: np.ndarray, + **kwargs: Any + ) -> List[List[Tuple[np.ndarray, np.ndarray]]]: + cv = RepeatedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state=random_state) + + tmp_splits = list(cv.split(indices)) + splits = [] + for i in range(num_repeats): + folds = [] + for j in range(num_splits): + folds.append(tmp_splits[i*num_splits + j]) + splits.append(folds) + return splits + + @staticmethod + def stratified_repeated_k_fold_cross_validation(random_state: np.random.RandomState, + num_splits: int, + num_repeats: int, + indices: np.ndarray, + **kwargs: Any + ) -> List[List[Tuple[np.ndarray, np.ndarray]]]: + cv = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state=random_state) + y=kwargs["stratify"] + tmp_splits = list(cv.split(indices, y[indices])) + splits = [] + for i in range(num_repeats): + folds = [] + for j in range(num_splits): + folds.append(tmp_splits[i*num_splits + j]) + splits.append(folds) + return splits + + @classmethod + def get_repeated_cross_validators(cls, *repeated_cross_validator_types: RepeatedCrossValTypes + ) -> Dict[str, RepeatedCrossValFunc]: + repeated_cross_validators: Dict[str, RepeatedCrossValFunc] = { + repeated_cross_validator.name: getattr(cls, repeated_cross_validator.name) + for repeated_cross_validator in repeated_cross_validator_types + } + return repeated_cross_validators diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py index 6cabfe525..5a15e759b 100644 --- a/autoPyTorch/datasets/tabular_dataset.py +++ b/autoPyTorch/datasets/tabular_dataset.py @@ -16,7 +16,7 @@ TABULAR_REGRESSION, TASK_TYPES_TO_STRING, ) -from autoPyTorch.data.base_validator import BaseInputValidator +from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, 
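A self-contained sketch of the nested split layout produced by RepeatedCrossValFuncs above and consumed by BaseDataset.get_dataset(split_id, train, repeat_id); the sizes are illustrative:

    import numpy as np
    from sklearn.model_selection import RepeatedKFold

    num_splits, num_repeats = 2, 2
    indices = np.arange(8)
    cv = RepeatedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state=0)
    tmp_splits = list(cv.split(indices))       # flat list of num_splits * num_repeats folds

    # regroup so that splits[repeat_id][split_id] == (train_indices, val_indices)
    splits = [
        [tmp_splits[i * num_splits + j] for j in range(num_splits)]
        for i in range(num_repeats)
    ]
    train_indices, val_indices = splits[1][0]  # repeat 1, fold 0
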
@@ -65,7 +65,7 @@ def __init__(self, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, dataset_name: Optional[str] = None, - validator: Optional[BaseInputValidator] = None, + validator: Optional[TabularInputValidator] = None, ): # Take information from the validator, which guarantees clean data for the @@ -81,7 +81,8 @@ def __init__(self, self.categorical_columns = validator.feature_validator.categorical_columns self.numerical_columns = validator.feature_validator.numerical_columns self.num_features = validator.feature_validator.num_features - self.categories = validator.feature_validator.categories + self.num_categories_per_col = validator.feature_validator.num_categories_per_col + self.feat_type = validator.feature_validator.feat_type super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test), shuffle=shuffle, resampling_strategy=resampling_strategy, diff --git a/autoPyTorch/datasets/utils.py b/autoPyTorch/datasets/utils.py new file mode 100644 index 000000000..aaa5d8df2 --- /dev/null +++ b/autoPyTorch/datasets/utils.py @@ -0,0 +1,48 @@ +from typing import Dict, List, Optional + +import numpy as np + +import pandas as pd + +from autoPyTorch.data.base_validator import BaseInputValidator +from autoPyTorch.datasets.base_dataset import BaseDataset +from autoPyTorch.datasets.resampling_strategy import ResamplingStrategies +from autoPyTorch.constants import ( + STRING_TO_TASK_TYPES, + CLASSIFICATION_TASKS, +) +from autoPyTorch.utils.data_classes import get_data_validator_class, get_dataset_class + + +def get_appended_dataset( + original_dataset: BaseDataset, + previous_layer_predictions_train: List[Optional[np.ndarray]], + previous_layer_predictions_test: List[Optional[np.ndarray]], + resampling_strategy: ResamplingStrategies, + resampling_strategy_args: Optional[Dict] + ) -> BaseDataset: + + X_train, y_train = original_dataset.train_tensors + X_test, y_test = original_dataset.test_tensors + + X_train = pd.DataFrame(np.concatenate([X_train, *previous_layer_predictions_train], axis=1)) + X_test = pd.DataFrame(np.concatenate([X_test, *previous_layer_predictions_test], axis=1)) + + new_feat_types: List[str] = original_dataset.feat_type.copy() + new_feat_types.extend(['numerical'] * (original_dataset.num_classes * len(previous_layer_predictions_train))) + validator: BaseInputValidator = get_data_validator_class(original_dataset.task_type)( + is_classification=STRING_TO_TASK_TYPES[original_dataset.task_type] in CLASSIFICATION_TASKS, + feat_type=new_feat_types) + validator.fit(X_train, y_train, X_test=X_test, y_test=y_test) + + dataset = get_dataset_class(original_dataset.task_type)( + X=X_train, + Y=y_train, + X_test=X_test, + Y_test=y_test, + validator=validator, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args) + + return dataset + diff --git a/autoPyTorch/ensemble/autogluon_stacking_ensemble.py b/autoPyTorch/ensemble/autogluon_stacking_ensemble.py new file mode 100644 index 000000000..4f255b4ab --- /dev/null +++ b/autoPyTorch/ensemble/autogluon_stacking_ensemble.py @@ -0,0 +1,158 @@ +from collections import Counter +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np + +from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble +from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection +from autoPyTorch.pipeline.base_pipeline import BasePipeline +from autoPyTorch.pipeline.components.training.metrics.base import 
autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss + + +class AutogluonStackingEnsemble(AbstractEnsemble): + def __init__( + self, + ) -> None: + self.ensemble_identifiers: Optional[List[List[Tuple[int, int, float]]]] = None + self.ensemble_weights: Optional[List[List]] = None + + def fit( + self, + identifiers: List[List[Tuple[int, int, float]]], + weights: List[List] + ) -> AbstractEnsemble: + """ + Builds a ensemble given the individual models out of fold predictions. + Fundamentally, defines a set of weights on how to perform a soft-voting + aggregation of the models in the given identifiers. + + Args: + predictions (List[np.ndarray]): + A list of individual model predictions of shape (n_datapoints, n_targets) + corresponding to the OutOfFold estimate of the ground truth + labels (np.ndarray): + The ground truth targets of shape (n_datapoints, n_targets) + identifiers: List[Tuple[int, int, float]] + A list of model identifiers, each with the form + (seed, number of run, budget) + + Returns: + A copy of self + """ + self.ensemble_identifiers = identifiers + self.ensemble_weights = weights + return self + + def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: + """ + Given a list of predictions from the individual model, this method + aggregates the predictions using a soft voting scheme with the weights + found during training. + + Args: + predictions (List[np.ndarray]): + A list of predictions from the individual base models. + + Returns: + average (np.ndarray): Soft voting predictions of ensemble models, using + the weights found during ensemble selection (self._weights) + """ + + average = np.zeros_like(predictions[0], dtype=np.float64) + tmp_predictions = np.empty_like(predictions[0], dtype=np.float64) + + # if predictions.shape[0] == len(self.weights_), + # predictions include those of zero-weight models. + if len(predictions) == len(self.ensemble_weights[-1]): + for pred, weight in zip(predictions, self.ensemble_weights[-1]): + np.multiply(pred, weight, out=tmp_predictions) + np.add(average, tmp_predictions, out=average) + + # if prediction model.shape[0] == len(non_null_weights), + # predictions do not include those of zero-weight models. + elif len(predictions) == np.count_nonzero(self.ensemble_weights[-1]): + non_null_weights = [w for w in self.ensemble_weights[-1] if w > 0] + for pred, weight in zip(predictions, non_null_weights): + np.multiply(pred, weight, out=tmp_predictions) + np.add(average, tmp_predictions, out=average) + + # If none of the above applies, then something must have gone wrong. + else: + raise ValueError("The dimensions of ensemble predictions" + " and ensemble weights do not match!") + del tmp_predictions + return average + + def __str__(self) -> str: + return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \ + '\n\tWeights: %s\n\tIdentifiers: %s' % \ + (' '.join(['%d: %5f' % (idx, performance) + for idx, performance in enumerate(self.trajectory_)]), + self.indices_, self.ensemble_weights[-1], + ' '.join([str(identifier) for idx, identifier in + enumerate(self.identifiers_) + if self.ensemble_weights[-1][idx] > 0])) + + def get_models_with_weights( + self, + models: Dict[Any, BasePipeline] + ) -> List[Tuple[float, BasePipeline]]: + """ + Handy function to tag the provided input models with a given weight. + + Args: + models (List[Tuple[float, BasePipeline]]): + A dictionary that maps a model's name to it's actual python object. 
+ + Returns: + output (List[Tuple[float, BasePipeline]]): + each model with the related weight, sorted by ascending + performance. Notice that ensemble selection solves a minimization + problem. + """ + outputs = [] + for layer_models, identifiers, layer_weights in zip(models, self.ensemble_identifiers, self.ensemble_weights): + output = [] + for identifier, weight in zip(identifiers, layer_weights): + model = layer_models[identifier] + output.append((weight, model)) + output.sort(reverse=True, key=lambda t: t[0]) + outputs.append(output) + + return outputs + + def get_expanded_layer_stacking_ensemble_predictions( + self, + stacking_layer, + raw_stacking_layer_ensemble_predictions + ) -> List[np.ndarray]: + layer_weights = self.ensemble_weights[stacking_layer] + layer_size = len(self.ensemble_weights[stacking_layer]) + ensemble_predictions = [] + for weight, pred in zip(layer_weights, raw_stacking_layer_ensemble_predictions): + ensemble_predictions.extend([pred] * int(weight * layer_size)) + return ensemble_predictions + + def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: + """ + After training of ensemble selection, not all models will be used. + Some of them will have zero weight. This procedure filters this models + out. + + Returns: + output (List[Tuple[int, int, float]]): + The models actually used by ensemble selection + """ + return self.ensemble_identifiers + + def get_validation_performance(self) -> float: + """ + Returns the best optimization performance seen during hill climbing + + Returns: + (float): + best ensemble training performance + """ + return 0 + diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py index ea2b77c97..1d075e151 100644 --- a/autoPyTorch/ensemble/ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_builder.py @@ -59,6 +59,9 @@ def __init__( random_state: Optional[Union[int, np.random.RandomState]] = None, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, unit_test: bool = False, + initial_num_run: int = 0, + num_stacking_layers: Optional[int] = None, + use_ensemble_opt_loss = False ): """ Constructor @@ -125,6 +128,7 @@ def __init__( self.ensemble_size = ensemble_size self.performance_range_threshold = performance_range_threshold + self.initial_num_run = initial_num_run if isinstance(ensemble_nbest, numbers.Integral) and ensemble_nbest < 1: raise ValueError("Integer ensemble_nbest has to be larger 1: %s" % ensemble_nbest) @@ -593,6 +597,8 @@ def compute_loss_per_model(self) -> bool: # Mypy assumes sorted returns an object because of the lambda. 
Can't get to recognize the list # as a returning list, so as a work-around we skip next line for y_ens_fn, match, _seed, _num_run, _budget in sorted(to_read, key=lambda x: x[3]): # type: ignore + if _num_run < self.initial_num_run: + continue if self.read_at_most and n_read_files >= self.read_at_most: # limit the number of files that will be read # to limit memory consumption @@ -649,7 +655,6 @@ def compute_loss_per_model(self) -> bool: os.path.getmtime(y_ens_fn), ) - self.logger.debug(f"keys in losses {losses.keys()}") self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] # It is not needed to create the object here @@ -1107,7 +1112,7 @@ def _get_list_of_sorted_preds(self) -> List[Tuple[str, float, int]]: # We want small num_run first key=lambda x: (x[1], x[2]), )) - self.logger.debug(f"Selected keys: {sorted_keys}") + # self.logger.debug(f"Selected keys: {sorted_keys}") return sorted_keys def _delete_excess_models(self, selected_keys: List[str]) -> None: @@ -1130,6 +1135,8 @@ def _delete_excess_models(self, selected_keys: List[str]) -> None: # Don't waste time if not enough models to delete return + self.logger.debug(f"num sorted_keys before delete: {len(sorted_keys)}, pred files: {len(self.y_ens_files)}") + # The top self.max_resident_models models would be the candidates # Any other low performance model will be deleted # The list is in ascending order of score @@ -1154,7 +1161,8 @@ def _delete_excess_models(self, selected_keys: List[str]) -> None: _budget = float(match.group(3)) # Do not delete the dummy prediction - if _num_run == 1: + if _num_run == 1 or _num_run < self.initial_num_run: + self.logger.debug(f"skipping for numrun {_num_run}") continue numrun_dir = self.backend.get_numrun_directory(_seed, _num_run, _budget) diff --git a/autoPyTorch/ensemble/ensemble_builder_manager.py b/autoPyTorch/ensemble/ensemble_builder_manager.py index 84ef362ba..0e22f4c96 100644 --- a/autoPyTorch/ensemble/ensemble_builder_manager.py +++ b/autoPyTorch/ensemble/ensemble_builder_manager.py @@ -19,10 +19,10 @@ from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import BINARY -from autoPyTorch.ensemble.utils import get_ensemble_builder_class +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes, get_ensemble_builder_class from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.logging_ import get_named_client_logger - +from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder class EnsembleBuilderManager(IncorporateRunResultCallback): def __init__( @@ -37,7 +37,7 @@ def __init__( opt_metric: str, ensemble_size: int, ensemble_nbest: int, - ensemble_method: int, + ensemble_method: EnsembleSelectionTypes, max_models_on_disc: Union[float, int], seed: int, precision: int, @@ -47,7 +47,10 @@ def __init__( random_state: int, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, pynisher_context: str = 'fork', - use_ensemble_loss=False + initial_num_run: int = 0, + use_ensemble_loss=False, + num_stacking_layers: Optional[int] = None, + iteration=0 ): """ SMAC callback to handle ensemble building Args: @@ -114,6 +117,11 @@ def __init__( self.ensemble_size = ensemble_size self.ensemble_nbest = ensemble_nbest self.ensemble_method = ensemble_method + self.cur_stacking_layer = 0 if self.ensemble_method.is_stacking_ensemble() else None + if self.ensemble_method.is_stacking_ensemble() and num_stacking_layers is None: + raise ValueError("Cant be none for stacked ensembles") + + 
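# Editor's sketch (not part of the original patch): for stacking ensembles,
# cur_stacking_layer starts at 0 and is advanced via update_for_new_stacking_layer()
# defined below; e.g. with num_stacking_layers=2 the manager first builds layer 0,
# then moves to layer 1, resetting iteration and initial_num_run for the new layer.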
self.num_stacking_layers = num_stacking_layers self.max_models_on_disc: Union[float, int] = max_models_on_disc self.seed = seed self.precision = precision @@ -124,6 +132,7 @@ def __init__( self.logger_port = logger_port self.pynisher_context = pynisher_context + self.is_new_layer = False # Store something similar to SMAC's runhistory self.history: List[Dict[str, float]] = [] @@ -131,12 +140,13 @@ def __init__( self.futures: List[dask.Future] = [] # The last criteria is the number of iterations - self.iteration = 0 + self.iteration = iteration # Keep track of when we started to know when we need to finish! self.start_time = time.time() self.use_ensemble_loss = use_ensemble_loss + self.initial_num_run = initial_num_run def __call__( self, @@ -229,7 +239,11 @@ def build_ensemble( pynisher_context=self.pynisher_context, logger_port=self.logger_port, unit_test=unit_test, - use_ensemble_opt_loss=self.use_ensemble_loss + use_ensemble_opt_loss=self.use_ensemble_loss, + cur_stacking_layer=self.cur_stacking_layer, + is_new_layer=self.is_new_layer, + num_stacking_layers=self.num_stacking_layers, + initial_num_run=self.initial_num_run )) logger.info( @@ -243,12 +257,23 @@ def build_ensemble( ), ) self.iteration += 1 + # reset to False so only signal from smbo sets is_new_layer = True + self.is_new_layer = False except Exception as e: exception_traceback = traceback.format_exc() error_message = repr(e) logger.critical(exception_traceback) logger.critical(error_message) + def update_for_new_stacking_layer(self, cur_stacking_layer: int, initial_num_run: int) -> None: + if cur_stacking_layer >= self.num_stacking_layers: + raise ValueError(f"Unexpected value '{cur_stacking_layer}' for cur_stacking_layer. " + f"Max stacking layers are : {self.num_stacking_layers}.") + self.cur_stacking_layer = cur_stacking_layer + self.iteration = 0 + self.initial_num_run = initial_num_run + self.is_new_layer = True + def fit_and_return_ensemble( backend: Backend, @@ -259,7 +284,7 @@ def fit_and_return_ensemble( opt_metric: str, ensemble_size: int, ensemble_nbest: int, - ensemble_method: int, + ensemble_method: EnsembleSelectionTypes, max_models_on_disc: Union[float, int], seed: int, precision: int, @@ -272,7 +297,11 @@ def fit_and_return_ensemble( pynisher_context: str, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, unit_test: bool = False, - use_ensemble_opt_loss=False + use_ensemble_opt_loss=False, + cur_stacking_layer: Optional[int] = None, + is_new_layer: bool = False, + num_stacking_layers: Optional[int] = None, + initial_num_run: int = 0, ) -> Tuple[ List[Dict[str, float]], int, @@ -340,6 +369,18 @@ def fit_and_return_ensemble( [[pandas_timestamp, train_performance, val_performance, test_performance], ...] 
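For example (values purely illustrative), a single history entry could be
[Timestamp('2022-04-26 14:21:36'), 0.123, 0.145, 0.151], i.e. the time the
ensemble was built followed by its train, validation and test performance.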
""" ensemble_builder = get_ensemble_builder_class(ensemble_method) + ensemble_builder_run_kwargs = { + 'end_at': end_at, + 'iteration': iteration, + 'return_predictions': return_predictions, + 'pynisher_context': pynisher_context, + } + if ensemble_method.is_stacking_ensemble() and ensemble_method != EnsembleSelectionTypes.stacking_repeat_models: + ensemble_builder_run_kwargs.update({'cur_stacking_layer': cur_stacking_layer}) + + if ensemble_method == EnsembleSelectionTypes.stacking_ensemble_selection_per_layer: + ensemble_builder_run_kwargs.update({'is_new_layer': is_new_layer}) + result = ensemble_builder( backend=backend, dataset_name=dataset_name, @@ -357,11 +398,10 @@ def fit_and_return_ensemble( random_state=random_state, logger_port=logger_port, unit_test=unit_test, - use_ensemble_opt_loss=use_ensemble_opt_loss + use_ensemble_opt_loss=use_ensemble_opt_loss, + num_stacking_layers=num_stacking_layers, + initial_num_run=initial_num_run ).run( - end_at=end_at, - iteration=iteration, - return_predictions=return_predictions, - pynisher_context=pynisher_context, + **ensemble_builder_run_kwargs ) return result diff --git a/autoPyTorch/ensemble/stacking_ensemble.py b/autoPyTorch/ensemble/ensemble_optimisation_stacking_ensemble.py similarity index 80% rename from autoPyTorch/ensemble/stacking_ensemble.py rename to autoPyTorch/ensemble/ensemble_optimisation_stacking_ensemble.py index 40ca5bc98..66541ad51 100644 --- a/autoPyTorch/ensemble/stacking_ensemble.py +++ b/autoPyTorch/ensemble/ensemble_optimisation_stacking_ensemble.py @@ -12,20 +12,26 @@ # TODO: Think of functionality of the functions in this class adjusted for stacking. -class StackingEnsemble(AbstractEnsemble): +class EnsembleOptimisationStackingEnsemble(AbstractEnsemble): def __init__( self, ensemble_size: int, metric: autoPyTorchMetric, task_type: int, random_state: np.random.RandomState, - ensemble_slot_j: int + ensemble_slot_j: int, + cur_stacking_layer: int, + stacked_ensemble_identifiers: List[List[Optional[Tuple[int, int, float]]]], + predictions_stacking_ensemble: List[List[Dict[str, Optional[np.ndarray]]]] ) -> None: self.ensemble_size = ensemble_size self.metric = metric self.random_state = random_state self.task_type = task_type self.ensemble_slot_j = ensemble_slot_j + self.cur_stacking_layer = cur_stacking_layer + self.stacked_ensemble_identifiers = stacked_ensemble_identifiers + self.predictions_stacking_ensemble = predictions_stacking_ensemble def __getstate__(self) -> Dict[str, Any]: # Cannot serialize a metric if @@ -41,7 +47,8 @@ def __getstate__(self) -> Dict[str, Any]: def fit( self, predictions_ensemble: List[np.ndarray], - best_model_predictions: np.ndarray, + best_model_predictions_ensemble: np.ndarray, + best_model_predictions_test: np.ndarray, labels: np.ndarray, ensemble_identifiers: List[Tuple[int, int, float]], best_model_identifier: Tuple[int, int, float], @@ -64,10 +71,15 @@ def fit( Returns: A copy of self """ - predictions_ensemble[self.ensemble_slot_j] = best_model_predictions + predictions_ensemble[self.ensemble_slot_j] = best_model_predictions_ensemble ensemble_identifiers[self.ensemble_slot_j] = best_model_identifier self._fit(predictions_ensemble, labels) self.identifiers_ = ensemble_identifiers + self.stacked_ensemble_identifiers[self.cur_stacking_layer] = ensemble_identifiers + self.predictions_stacking_ensemble[self.cur_stacking_layer][self.ensemble_slot_j] = { + 'ensemble': best_model_predictions_ensemble, + 'test': best_model_predictions_test + } self._calculate_weights() return self @@ -91,9 
+103,10 @@ def _fit( A list of model identifiers, each with the form (seed, number of run, budget) """ + nonnull_predictions = [pred for pred in predictions if pred is not None] weighted_ensemble_prediction = np.zeros( - predictions[0].shape, + nonnull_predictions[0].shape, dtype=np.float64, ) @@ -102,7 +115,6 @@ def _fit( dtype=np.float64, ) - nonnull_predictions = [pred for pred in predictions if pred is not None] size = len(nonnull_predictions) for pred in nonnull_predictions: np.add( @@ -129,8 +141,6 @@ def _fit( self.train_loss_: float = loss - # TODO: return 1 for models in layer 0, 2 for next and so on - # TODO: 0 for models that are not in stack def _calculate_weights(self) -> None: """ Calculates the contribution each of the individual models @@ -167,12 +177,13 @@ def _predict(self, predictions, weights): the weights """ - average = np.zeros_like(predictions[0], dtype=np.float64) - tmp_predictions = np.empty_like(predictions[0], dtype=np.float64) + nonnull_predictions = [pred for pred in predictions if pred is not None] + average = np.zeros_like(nonnull_predictions[0], dtype=np.float64) + tmp_predictions = np.empty_like(nonnull_predictions[0], dtype=np.float64) # if prediction model.shape[0] == len(non_null_weights), # predictions do not include those of zero-weight models. - if len([pred for pred in predictions if pred is not None]) == np.count_nonzero(weights): + if len(nonnull_predictions) == np.count_nonzero(weights): for pred, weight in zip(predictions, weights): if pred is not None: np.multiply(pred, weight, out=tmp_predictions) @@ -187,9 +198,17 @@ def _predict(self, predictions, weights): return average def __str__(self) -> str: - return f"Ensemble Selection:\n\tWeights: {self.weights_}\ + return f"Ensemble Optimisation Stacking Ensemble:\n\tWeights: {self.weights_}\ \n\tIdentifiers: {' '.join([str(identifier) for idx, identifier in enumerate(self.identifiers_) if self.weights_[idx] > 0])}" + def get_layer_stacking_ensemble_predictions( + self, + stacking_layer: int, + dataset: str = 'ensemble' + ) -> List[Optional[np.ndarray]]: + + return [predictions[dataset] if predictions is not None else None for predictions in self.predictions_stacking_ensemble[stacking_layer]] + def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: """ After training of ensemble selection, not all models will be used. @@ -200,7 +219,7 @@ def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: output (List[Tuple[int, int, float]]): The models actually used by ensemble selection """ - return self.identifiers_ + return self.stacked_ensemble_identifiers def get_validation_performance(self) -> float: """ @@ -255,13 +274,17 @@ def get_models_with_weights( performance. Notice that ensemble selection solves a minimization problem. 
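For example (illustrative only), a two-layer stack could return
[[(0.50, model_a), (0.25, model_b)], [(1.00, model_c)]], i.e. one list of
(weight, model) pairs per stacking layer, sorted by descending weight.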
""" - output = [] - for i, weight in enumerate(self.weights_): - if weight > 0.0: - identifier = self.identifiers_[i] - model = models[identifier] - output.append((weight, model)) - - output.sort(reverse=True, key=lambda t: t[0]) - - return output \ No newline at end of file + outputs = [] + for i, layer_models in enumerate(models): + output = [] + num_models = len(layer_models) + if i == len(models): + weights = self.weights_ + else: + weights = [1/num_models] * len(models) + for weight, model in zip(weights, layer_models): + output.append((weight, layer_models[model])) + output.sort(reverse=True, key=lambda t: t[0]) + outputs.append(output) + + return outputs diff --git a/autoPyTorch/ensemble/stacking_ensemble_builder.py b/autoPyTorch/ensemble/ensemble_optimisation_stacking_ensemble_builder.py similarity index 77% rename from autoPyTorch/ensemble/stacking_ensemble_builder.py rename to autoPyTorch/ensemble/ensemble_optimisation_stacking_ensemble_builder.py index 5097835f2..27c9f1527 100644 --- a/autoPyTorch/ensemble/stacking_ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_optimisation_stacking_ensemble_builder.py @@ -3,11 +3,9 @@ import logging.handlers import os import pickle -import re import time import traceback import warnings -import zlib from typing import Dict, List, Optional, Tuple, Union import numpy as np @@ -16,11 +14,11 @@ from autoPyTorch.constants import BINARY from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder -from autoPyTorch.ensemble.stacking_ensemble import StackingEnsemble +from autoPyTorch.ensemble.ensemble_optimisation_stacking_ensemble import EnsembleOptimisationStackingEnsemble from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss, calculate_score from autoPyTorch.utils.logging_ import get_named_client_logger - +from autoPyTorch.utils.common import ENSEMBLE_ITERATION_MULTIPLIER Y_ENSEMBLE = 0 Y_TEST = 1 @@ -28,7 +26,7 @@ MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' -def calculate_nomalised_margin_loss(ensemble_predictions, y_true, task_type) -> float: +def calculate_nomalised_margin_loss(ensemble_predictions, y_true) -> float: n_ensemble = 0 loss = 0 for pred in ensemble_predictions: @@ -41,7 +39,7 @@ def calculate_nomalised_margin_loss(ensemble_predictions, y_true, task_type) -> return np.mean(margin) # TODO: make functions to support stacking. -class StackingEnsembleBuilder(EnsembleBuilder): +class EnsembleOptimisationStackingEnsembleBuilder(EnsembleBuilder): def __init__( self, backend: Backend, @@ -61,7 +59,10 @@ def __init__( random_state: Optional[Union[int, np.random.RandomState]] = None, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, unit_test: bool = False, - use_ensemble_opt_loss=False + use_ensemble_opt_loss=False, + num_stacking_layers: int = 2, + cur_stacking_layer: int = 0, + initial_num_run: int = 0 ): """ Constructor @@ -117,7 +118,7 @@ def __init__( better solution, please let us know by opening an issue. 
""" - super(StackingEnsembleBuilder, self).__init__( + super(EnsembleOptimisationStackingEnsembleBuilder, self).__init__( backend=backend, dataset_name=dataset_name, task_type=task_type, output_type=output_type, metrics=metrics, opt_metric=opt_metric, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, @@ -125,13 +126,27 @@ def __init__( performance_range_threshold=performance_range_threshold, seed=seed, precision=precision, memory_limit=memory_limit, read_at_most=read_at_most, random_state=random_state, - logger_port=logger_port, unit_test=unit_test) + logger_port=logger_port, unit_test=unit_test, initial_num_run=initial_num_run) # we still need to store ensemble identifiers as this class is not persistant # we can do this by either storing and reading them in this class # or passing them via the ensemble builder manager which has persistency with the futures stored. - self.ensemble_identifiers: Optional[List[Optional[str]]] = None self.read_losses = {} self.use_ensemble_opt_loss = use_ensemble_opt_loss + self.num_stacking_layers = num_stacking_layers + self.cur_stacking_layer = cur_stacking_layer + + def run( + self, + iteration: int, + pynisher_context: str, + cur_stacking_layer: int, + time_left: Optional[float] = None, + end_at: Optional[float] = None, + time_buffer: int = 5, + return_predictions: bool = False, + ) -> Tuple[List[Dict[str, float]], int, Optional[np.ndarray], Optional[np.ndarray]]: + self.cur_stacking_layer = cur_stacking_layer + return super().run(iteration, pynisher_context, time_left, end_at, time_buffer, return_predictions) # This is the main wrapper to the EnsembleSelection class which fits the ensemble def main( @@ -199,10 +214,17 @@ def main( time_left - used_time, ) + self.current_ensemble_identifiers = self._load_current_ensemble_identifiers(cur_stacking_layer=self.cur_stacking_layer) self.ensemble_slot_j = np.mod(iteration, self.ensemble_size) - self.ensemble_identifiers = self._load_ensemble_identifiers() + # self.cutoff_num_run = self._load_ensemble_cutoff_num_run() + # # checks if we have moved to a new stacking layer. + # if self.cutoff_num_run == None: + # self.cutoff_num_run = self.initial_num_run self.logger.debug(f"Iteration for ensemble building:{iteration}, " - f"current model to be updated: {self.ensemble_identifiers[self.ensemble_slot_j]} at slot : {self.ensemble_slot_j}") + f"current model to be updated: {self.current_ensemble_identifiers[self.ensemble_slot_j]}" + f" at slot : {self.ensemble_slot_j}" + f" with cur_stacking_layer: {self.cur_stacking_layer}" + f" cut off num run: {self.initial_num_run}") # populates self.read_preds and self.read_losses with individual model predictions and ensemble loss. if not self.compute_ensemble_loss_per_model(): if return_predictions: @@ -241,12 +263,14 @@ def main( # Save the ensemble for later use in the main module! 
if ensemble is not None and self.SAVE2DISC: - self.backend.save_ensemble(ensemble, iteration, self.seed) + self.backend.save_ensemble(ensemble, (self.cur_stacking_layer)*ENSEMBLE_ITERATION_MULTIPLIER + iteration, self.seed) ensemble_identifiers=self._get_identifiers_from_num_runs(ensemble.identifiers_) - # self.logger.debug(f"ensemble_identifiers being saved are {ensemble_identifiers}") - self._save_ensemble_identifiers( - ensemble_identifiers=ensemble_identifiers + self.logger.debug(f"ensemble_identifiers being saved are {ensemble_identifiers}") + self._save_current_ensemble_identifiers( + ensemble_identifiers=ensemble_identifiers, + cur_stacking_layer=self.cur_stacking_layer ) + self._save_ensemble_cutoff_num_run(cutoff_num_run=self.initial_num_run) # Delete files of non-candidate models - can only be done after fitting the ensemble and # saving it to disc so we do not accidentally delete models in the previous ensemble if self.max_resident_models is not None: @@ -286,6 +310,7 @@ def main( else: return self.ensemble_history, self.ensemble_nbest, None, None + # TODO: change to calculate stacked ensemble loss per model def compute_ensemble_loss_per_model(self) -> bool: """ Compute the loss of the predictions on ensemble building data set; @@ -337,6 +362,11 @@ def compute_ensemble_loss_per_model(self) -> bool: # Mypy assumes sorted returns an object because of the lambda. Can't get to recognize the list # as a returning list, so as a work-around we skip next line for y_ens_fn, match, _seed, _num_run, _budget in sorted(to_read, key=lambda x: x[3]): # type: ignore + + # skip models that were part of previous stacking layer + if _num_run < self.initial_num_run: + continue + if self.read_at_most and n_read_files >= self.read_at_most: # limit the number of files that will be read # to limit memory consumption @@ -373,9 +403,10 @@ def compute_ensemble_loss_per_model(self) -> bool: # actually read the predictions and compute their respective loss try: - ensemble_idenitfiers = self.ensemble_identifiers.copy() + ensemble_idenitfiers = self.current_ensemble_identifiers.copy() ensemble_idenitfiers[self.ensemble_slot_j] = y_ens_fn y_ensemble = self._read_np_fn(y_ens_fn) + # self.logger.debug(f"predictions: {y_ensemble}, ensemble_identiifers: {ensemble_idenitfiers}") losses = self.get_ensemble_loss_with_model( model_predictions=y_ensemble, ensemble_identifiers=ensemble_idenitfiers @@ -412,7 +443,7 @@ def compute_ensemble_loss_per_model(self) -> bool: def fit_ensemble( self, best_model_identifier: str, - ) -> Optional[StackingEnsemble]: + ) -> Optional[EnsembleOptimisationStackingEnsemble]: """ fit ensemble @@ -426,23 +457,16 @@ def fit_ensemble( ensemble: StackingEnsemble trained Ensemble """ - - assert self.ensemble_identifiers is not None + assert self.current_ensemble_identifiers is not None if self.unit_test: raise MemoryError() - predictions_train = [self.read_preds[k][Y_ENSEMBLE] if k is not None else None for k in self.ensemble_identifiers] - best_model_predictions = self.read_preds[best_model_identifier][Y_ENSEMBLE] + predictions_train = [self.read_preds[k][Y_ENSEMBLE] if k is not None else None for k in self.current_ensemble_identifiers] + best_model_predictions_ensemble = self.read_preds[best_model_identifier][Y_ENSEMBLE] + best_model_predictions_test = self.read_preds[best_model_identifier][Y_TEST] - ensemble_num_runs = [ - ( - self.read_losses[k]["seed"], - self.read_losses[k]["num_run"], - self.read_losses[k]["budget"], - ) - if k is not None else None - for k in self.ensemble_identifiers] 
+ ensemble_num_runs = self._get_num_runs_from_identifiers(self.current_ensemble_identifiers) best_model_num_run = ( self.read_losses[best_model_identifier]["seed"], @@ -450,17 +474,34 @@ def fit_ensemble( self.read_losses[best_model_identifier]["budget"], ) + stacked_ensemble_identifiers = self._load_stacked_ensemble_identifiers() + self.logger.debug(f"Stacked ensemble identifiers: {stacked_ensemble_identifiers}") + stacked_ensemble_num_runs = [ + self._get_num_runs_from_identifiers(layer_identifiers) + for layer_identifiers in stacked_ensemble_identifiers + ] + + predictions_stacking_ensemble = [ + [ + {'ensemble': self.read_preds[k][Y_ENSEMBLE], 'test': self.read_preds[k][Y_TEST]} if k is not None else None for k in layer_identifiers + ] + for layer_identifiers in stacked_ensemble_identifiers + ] + opt_metric = [m for m in self.metrics if m.name == self.opt_metric][0] if not opt_metric: raise ValueError(f"Cannot optimize for {self.opt_metric} in {self.metrics} " "as more than one unique optimization metric was found.") - ensemble = StackingEnsemble( + ensemble = EnsembleOptimisationStackingEnsemble( ensemble_size=self.ensemble_size, metric=opt_metric, random_state=self.random_state, task_type=self.task_type, - ensemble_slot_j=self.ensemble_slot_j + ensemble_slot_j=self.ensemble_slot_j, + cur_stacking_layer=self.cur_stacking_layer, + stacked_ensemble_identifiers=stacked_ensemble_num_runs, + predictions_stacking_ensemble=predictions_stacking_ensemble ) try: @@ -468,11 +509,13 @@ def fit_ensemble( # "Fitting the ensemble on %d models.", # len(predictions_train), # ) - # self.logger.debug(f"predictions sent to ensemble: {predictions_train}") + # self.logger.debug(f"predictions sent to ensemble: {predictions_train}, ensemble_num_runs: {ensemble_num_runs}") + # self.logger.debug(f"best model predictions: {best_model_predictions_ensemble}, ensemble_slot: {ensemble.ensemble_slot_j}, best_model_num_run: {best_model_num_run}") start_time = time.time() ensemble.fit( predictions_train, - best_model_predictions, + best_model_predictions_ensemble, + best_model_predictions_test, self.y_true_ensemble, ensemble_num_runs, best_model_num_run @@ -535,6 +578,8 @@ def predict(self, set_: str, predictions = [self.read_preds[k][pred_set] if k is not None else None for k in selected_keys] + # self.logger.debug(f" in predic(), selected_keys: {selected_keys}" + # f"predictions sent to ensemble.predict: {predictions}") if n_preds == len(predictions): y = ensemble.predict(predictions) if self.output_type == BINARY: @@ -622,11 +667,12 @@ def get_ensemble_loss_with_model(self, else: predictions = self.read_preds[identifier][Y_ENSEMBLE] else: - break + predictions=None ensemble_predictions.append(predictions) - np.multiply(predictions, weight, out=tmp_predictions) - np.add(average_predictions, tmp_predictions, out=average_predictions) + if predictions is not None: + np.multiply(predictions, weight, out=tmp_predictions) + np.add(average_predictions, tmp_predictions, out=average_predictions) loss = calculate_loss( metrics=self.metrics, @@ -634,24 +680,46 @@ def get_ensemble_loss_with_model(self, prediction=average_predictions, task_type=self.task_type, ) - loss["ensemble_opt_loss"] = calculate_nomalised_margin_loss(ensemble_predictions, self.y_true_ensemble, self.task_type) + loss["ensemble_opt_loss"] = calculate_nomalised_margin_loss(ensemble_predictions, self.y_true_ensemble) return loss - def _get_ensemble_identifiers_filename(self): - return os.path.join(self.backend.internals_directory, 'ensemble_identifiers.pkl') + 
def _get_ensemble_identifiers_filename(self, cur_stacking_layer) -> str: + return os.path.join(self.backend.internals_directory, f'ensemble_identifiers_{cur_stacking_layer}.pkl') + + def _get_ensemble_cutoff_num_run_filename(self): + return os.path.join(self.backend.internals_directory, 'ensemble_cutoff_run.txt') + + def _save_ensemble_cutoff_num_run(self, cutoff_num_run: int) -> None: + with open(self._get_ensemble_cutoff_num_run_filename(), "w") as file: + file.write(str(cutoff_num_run)) - def _save_ensemble_identifiers(self, ensemble_identifiers: List[Optional[str]]) -> None: - with open(self._get_ensemble_identifiers_filename(), "wb") as file: + def _load_ensemble_cutoff_num_run(self) -> Optional[int]: + if os.path.exists(self._get_ensemble_cutoff_num_run_filename()): + with open(self._get_ensemble_cutoff_num_run_filename(), "r") as file: + cutoff_num_run = int(file.read()) + else: + cutoff_num_run = None + return cutoff_num_run + + def _save_current_ensemble_identifiers(self, ensemble_identifiers: List[Optional[str]], cur_stacking_layer) -> None: + with open(self._get_ensemble_identifiers_filename(cur_stacking_layer=cur_stacking_layer), "wb") as file: pickle.dump(ensemble_identifiers, file=file) - def _load_ensemble_identifiers(self) -> List[Optional[str]]: - if os.path.exists(self._get_ensemble_identifiers_filename()): - with open(self._get_ensemble_identifiers_filename(), "rb") as file: + def _load_current_ensemble_identifiers(self, cur_stacking_layer) -> List[Optional[str]]: + file_name = self._get_ensemble_identifiers_filename(cur_stacking_layer) + if os.path.exists(file_name): + with open(file_name, "rb") as file: identifiers = pickle.load(file) else: identifiers = [None]*self.ensemble_size return identifiers + def _load_stacked_ensemble_identifiers(self) -> List[List[Optional[str]]]: + ensemble_identifiers = list() + for i in range(self.num_stacking_layers): + ensemble_identifiers.append(self._load_current_ensemble_identifiers(cur_stacking_layer=i)) + return ensemble_identifiers + def _get_identifiers_from_num_runs(self, num_runs, subset='ensemble') -> List[Optional[str]]: identifiers: List[Optional[str]] = [] for num_run in num_runs: @@ -665,3 +733,18 @@ def _get_identifiers_from_num_runs(self, num_runs, subset='ensemble') -> List[Op identifiers.append(identifier) return identifiers + def _get_num_runs_from_identifiers(self, identifiers) -> List[Optional[Tuple[int, int, float]]]: + num_runs: List[Optional[Tuple[int, int, float]]] = [] + for identifier in identifiers: + num_run = None + if identifier is not None: + match = self.model_fn_re.search(identifier) + if match is None: + raise ValueError(f"Could not interpret file {identifier} " + "Something went wrong while scoring predictions") + _seed = int(match.group(1)) + _num_run = int(match.group(2)) + _budget = float(match.group(3)) + num_run = (_seed, _num_run, _budget) + num_runs.append(num_run) + return num_runs diff --git a/autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble.py b/autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble.py new file mode 100644 index 000000000..a20a8fc9c --- /dev/null +++ b/autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble.py @@ -0,0 +1,145 @@ +from copyreg import pickle +from ctypes import cast +from glob import glob +from typing import Any, Dict, List, Optional, Tuple, Union +import warnings + +import numpy as np + + +from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble +from autoPyTorch.pipeline.base_pipeline import BasePipeline 
+from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss +from autoPyTorch.automl_common.common.utils.backend import Backend + +# TODO: Think of functionality of the functions in this class adjusted for stacking. +class EnsembleSelectionPerLayerStackingEnsemble(AbstractEnsemble): + def __init__( + self, + num_stacking_layers, + cur_stacking_layer, + ensembles = None, + ensemble_predictions = None, + ) -> None: + self.ensembles: List[Optional[AbstractEnsemble]] = [None] * num_stacking_layers if ensembles is None else ensembles + self.cur_stacking_layer = cur_stacking_layer + self.ensemble_predictions = [None] * num_stacking_layers if ensemble_predictions is None else ensemble_predictions + + # def __getstate__(self) -> Dict[str, Any]: + # # Cannot serialize a metric if + # # it is user defined. + # # That is, if doing pickle dump + # # the metric won't be the same as the + # # one in __main__. we don't use the metric + # # in the EnsembleSelection so this should + # # be fine + # self.metric = None # type: ignore + # return self.__dict__ + + def fit( + self, + cur_ensemble: AbstractEnsemble, + cur_ensemble_predictions, + ) -> AbstractEnsemble: + """ + Builds a ensemble given the individual models out of fold predictions. + Fundamentally, defines a set of weights on how to perform a soft-voting + aggregation of the models in the given identifiers. + + Args: + predictions (List[np.ndarray]): + A list of individual model predictions of shape (n_datapoints, n_targets) + corresponding to the OutOfFold estimate of the ground truth + labels (np.ndarray): + The ground truth targets of shape (n_datapoints, n_targets) + identifiers: List[Tuple[int, int, float]] + A list of model identifiers, each with the form + (seed, number of run, budget) + + Returns: + A copy of self + """ + self.ensembles[self.cur_stacking_layer] = cur_ensemble + self.ensemble_predictions[self.cur_stacking_layer] = cur_ensemble_predictions + return self + + def predict(self, predictions: List[np.ndarray]) -> np.ndarray: + # should be the last layer + return self.ensembles[self.cur_stacking_layer].predict(predictions) + + def __str__(self) -> str: + return f"Ensemble Selection Per Layer Stacking Ensemble:\n\tWeights: {self.ensembles[self.cur_stacking_layer].weights_}\ + \n\tIdentifiers: {' '.join([str(identifier) for idx, identifier in enumerate(self.ensembles[self.cur_stacking_layer].identifiers_) if self.ensembles[self.cur_stacking_layer].weights_[idx] > 0])}" + + def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: + """ + After training of ensemble selection, not all models will be used. + Some of them will have zero weight. This procedure filters this models + out. 
+ + Returns: + output (List[Tuple[int, int, float]]): + The models actually used by ensemble selection + """ + ensemble_identifiers = list() + for ensemble in self.ensembles: + if ensemble is None: + return ensemble_identifiers + ensemble_identifiers.append(ensemble.get_selected_model_identifiers()) + + return ensemble_identifiers + + def get_validation_performance(self) -> float: + """ + Returns the best optimization performance seen during hill climbing + + Returns: + (float): + best ensemble training performance + """ + return self.ensembles[self.cur_stacking_layer].trajectory_[-1] + + def get_models_with_weights( + self, + models: Dict[Any, BasePipeline] + ) -> List[Tuple[float, BasePipeline]]: + """ + Handy function to tag the provided input models with a given weight. + + Args: + models (List[Tuple[float, BasePipeline]]): + A dictionary that maps a model's name to it's actual python object. + + Returns: + output (List[Tuple[float, BasePipeline]]): + each model with the related weight, sorted by ascending + performance. Notice that ensemble selection solves a minimization + problem. + """ + outputs = [] + for ensemble, layer_models in zip(self.ensembles, models): + outputs.append(ensemble.get_models_with_weights(layer_models)) + + return outputs + + def get_expanded_layer_stacking_ensemble_predictions( + self, + stacking_layer: int, + raw_stacking_layer_ensemble_predictions + ) -> List[np.ndarray]: + layer_weights = [weight for weight in self.ensembles[stacking_layer].weights_ if weight > 0] + layer_size = self.ensembles[stacking_layer].ensemble_size + ensemble_predictions = [] + for weight, pred in zip(layer_weights, raw_stacking_layer_ensemble_predictions): + ensemble_predictions.extend([pred] * int(weight * layer_size)) + return ensemble_predictions + + def get_layer_stacking_ensemble_predictions( + self, + stacking_layer: int, + dataset: str = 'ensemble' + ) -> List[Optional[np.ndarray]]: + raw_stacking_layer_ensemble_predictions = self.ensemble_predictions[stacking_layer][dataset] + + return self.get_expanded_layer_stacking_ensemble_predictions(stacking_layer=stacking_layer, raw_stacking_layer_ensemble_predictions=raw_stacking_layer_ensemble_predictions) diff --git a/autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble_builder.py b/autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble_builder.py new file mode 100644 index 000000000..bb7a868ed --- /dev/null +++ b/autoPyTorch/ensemble/ensemble_selection_per_layer_stacking_ensemble_builder.py @@ -0,0 +1,620 @@ +import glob +import logging +import logging.handlers +import os +import pickle +import re +import time +import traceback +import warnings +from typing import Dict, List, Optional, Tuple, Union +import zlib + +import numpy as np + +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.constants import BINARY +from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble +from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder +from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection +from autoPyTorch.ensemble.ensemble_selection_per_layer_stacking_ensemble import EnsembleSelectionPerLayerStackingEnsemble +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss, calculate_score +from autoPyTorch.utils.logging_ import get_named_client_logger + +Y_ENSEMBLE = 0 +Y_TEST = 1 + +MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]+\.*[0-9]*)\.npy' + 
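# Editor's sketch (not part of the original patch): MODEL_FN_RE extracts the
# (seed, num_run, budget) triplet from a prediction file name, assuming the
# predictions_ensemble_<seed>_<num_run>_<budget>.npy naming used by the glob below:
#
#   import re
#   match = re.search(MODEL_FN_RE, 'predictions_ensemble_1_23_50.0.npy')  # hypothetical file name
#   seed, num_run, budget = int(match.group(1)), int(match.group(2)), float(match.group(3))
#   # seed == 1, num_run == 23, budget == 50.0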
+ +# TODO: make functions to support stacking. +class EnsembleSelectionPerLayerStackingEnsembleBuilder(EnsembleBuilder): + def __init__( + self, + backend: Backend, + dataset_name: str, + task_type: int, + output_type: int, + metrics: List[autoPyTorchMetric], + opt_metric: str, + ensemble_size: int = 10, + ensemble_nbest: int = 100, + max_models_on_disc: Union[float, int] = 100, + performance_range_threshold: float = 0, + seed: int = 1, + precision: int = 32, + memory_limit: Optional[int] = 1024, + read_at_most: int = 5, + random_state: Optional[Union[int, np.random.RandomState]] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + unit_test: bool = False, + use_ensemble_opt_loss=False, + num_stacking_layers: int = 2, + cur_stacking_layer: int = 0, + initial_num_run: int = 0 + ): + """ + Constructor + Parameters + ---------- + backend: util.backend.Backend + backend to write and read files + dataset_name: str + name of dataset + task_type: int + type of ML task + metrics: List[autoPyTorchMetric], + name of metric to score predictions + opt_metric: str + name of the metric to optimize + ensemble_size: int + maximal size of ensemble (passed to ensemble.ensemble_selection) + ensemble_nbest: int/float + if int: consider only the n best prediction + if float: consider only this fraction of the best models + Both wrt to validation predictions + If performance_range_threshold > 0, might return less models + max_models_on_disc: Union[float, int] + Defines the maximum number of models that are kept in the disc. + If int, it must be greater or equal than 1, and dictates the max number of + models to keep. + If float, it will be interpreted as the max megabytes allowed of disc space. That + is, if the number of ensemble candidates require more disc space than this float + value, the worst models will be deleted to keep within this budget. + Models and predictions of the worst-performing models will be deleted then. + If None, the feature is disabled. + It defines an upper bound on the models that can be used in the ensemble. + performance_range_threshold: float + Keep only models that are better than: + dummy + (best - dummy)*performance_range_threshold + E.g dummy=2, best=4, thresh=0.5 --> only consider models with score > 3 + Will at most return the minimum between ensemble_nbest models, + and max_models_on_disc. Might return less + seed: int + random seed + precision: [16,32,64,128] + precision of floats to read the predictions + memory_limit: Optional[int] + memory limit in mb. If ``None``, no memory limit is enforced. + read_at_most: int + read at most n new prediction files in each iteration + logger_port: int + port that receives logging records + unit_test: bool + Turn on unit testing mode. This currently makes fit_ensemble raise a MemoryError. + Having this is very bad coding style, but I did not find a way to make + unittest.mock work through the pynisher with all spawn contexts. If you know a + better solution, please let us know by opening an issue. 
+ """ + + super(EnsembleSelectionPerLayerStackingEnsembleBuilder, self).__init__( + backend=backend, dataset_name=dataset_name, task_type=task_type, + output_type=output_type, metrics=metrics, opt_metric=opt_metric, + ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc, + performance_range_threshold=performance_range_threshold, + seed=seed, precision=precision, memory_limit=memory_limit, + read_at_most=read_at_most, random_state=random_state, + logger_port=logger_port, unit_test=unit_test, initial_num_run=initial_num_run if cur_stacking_layer==0 else 1) + + self.num_stacking_layers = num_stacking_layers + self.cur_stacking_layer = cur_stacking_layer + self.ensembles = None + self.ensemble_predictions = None + old_ensemble: Optional[EnsembleSelectionPerLayerStackingEnsemble] = None + if os.path.exists(self.backend.get_ensemble_dir()) and len(os.listdir(self.backend.get_ensemble_dir())) >= 1: + old_ensemble = self.backend.load_ensemble(seed=seed) + self.ensembles = old_ensemble.ensembles + self.ensemble_predictions = old_ensemble.ensemble_predictions + + def run( + self, + iteration: int, + pynisher_context: str, + cur_stacking_layer: int, + time_left: Optional[float] = None, + end_at: Optional[float] = None, + time_buffer: int = 5, + return_predictions: bool = False, + is_new_layer: bool = False, + ) -> Tuple[List[Dict[str, float]], int, Optional[np.ndarray], Optional[np.ndarray]]: + self.cur_stacking_layer = cur_stacking_layer + self.is_new_layer = is_new_layer + return super().run(iteration, pynisher_context, time_left, end_at, time_buffer, return_predictions) + + # This is the main wrapper to the EnsembleSelection class which fits the ensemble + def main( + self, time_left: float, iteration: int, return_predictions: bool, + ) -> Tuple[ + List[Dict[str, float]], + int, + Optional[np.ndarray], + Optional[np.ndarray], + ]: + """ + This is the main function of the ensemble builder process and can be considered + a wrapper over the ensemble selection method implemented y EnsembleSelection class. + + This method is going to be called multiple times by the main process, to + build and ensemble, in case the SMAC process produced new models and to provide + anytime results. + + On this regard, this method mainly: + 1- select from all the individual models that smac created, the N-best candidates + (this in the scenario that N > ensemble_nbest argument to this class). This is + done based on a score calculated via the metrics argument. + 2- This pre-selected candidates are provided to the ensemble selection method + and if a ensemble is found under the provided memory/time constraints, a new + ensemble is proposed. + 3- Because this process will be called multiple times, it performs checks to make + sure a new ensenmble is only proposed if new predictions are available, as well + as making sure we do not run out of resources (like disk space) + + Args: + time_left (float): + How much time is left for the ensemble builder process + iteration (int): + Which is the current iteration + return_predictions (bool): + Whether we want to return the predictions of the current model or not + + Returns: + ensemble_history (Dict): + A snapshot of both test and optimization performance. For debugging. + ensemble_nbest (int): + The user provides a direction on how many models to use in ensemble selection. + This number can be reduced internally if the memory requirements force it. 
+ train_predictions (np.ndarray): + The optimization prediction from the current ensemble. + test_predictions (np.ndarray): + The train prediction from the current ensemble. + """ + + # Pynisher jobs inside dask 'forget' + # the logger configuration. So we have to set it up + # accordingly + self.logger = get_named_client_logger( + name='EnsembleBuilder', + port=self.logger_port, + ) + + self.start_time = time.time() + train_pred, test_pred = None, None + + used_time = time.time() - self.start_time + self.logger.debug( + 'Starting iteration %d, time left: %f', + iteration, + time_left - used_time, + ) + + + # self.cutoff_num_run = self._load_ensemble_cutoff_num_run() + # # TODO: check how to handle this now. + # # checks if we have moved to a new stacking layer. + # if self.cutoff_num_run is None or self.is_new_layer: + # # to exclude the latest model we subtract 1 from last available num run + # self.cutoff_num_run = self.backend.get_next_num_run(peek=True) - 1 + # self.logger.debug(f"Updated cut off num run to : {self.cutoff_num_run}") + + # populates self.read_preds and self.read_losses with individual model predictions and ensemble loss. + if not self.compute_loss_per_model(): + if return_predictions: + return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred + else: + return self.ensemble_history, self.ensemble_nbest, None, None + + # Only the models with the n_best predictions are candidates + # to be in the ensemble + candidate_models = self.get_n_best_preds() + if not candidate_models: # no candidates yet + if return_predictions: + return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred + else: + return self.ensemble_history, self.ensemble_nbest, None, None + + # populates test predictions in self.read_preds + # reduces selected models if file reading failed + n_sel_test = self.get_test_preds(selected_keys=candidate_models) + + # If any of n_sel_* is not empty and overlaps with candidate_models, + # then ensure candidate_models AND n_sel_test are sorted the same + candidate_models_set = set(candidate_models) + if candidate_models_set.intersection(n_sel_test): + candidate_models = sorted(list(candidate_models_set.intersection( + n_sel_test))) + n_sel_test = candidate_models + else: + # This has to be the case + n_sel_test = [] + + if os.environ.get('ENSEMBLE_KEEP_ALL_CANDIDATES'): + for candidate in candidate_models: + self._has_been_candidate.add(candidate) + + # self.logger.debug(f"for iteration {iteration}, best_model_identifier: {best_model_identifier} \n candidate_models: \n{candidate_models}") + # train ensemble + ensemble = self.fit_ensemble(selected_keys=candidate_models) + + # Save the ensemble for later use in the main module! 
+ if ensemble is not None and self.SAVE2DISC: + self.backend.save_ensemble(ensemble, iteration + (pow(10, 9))* self.cur_stacking_layer, self.seed) + # self._save_ensemble_cutoff_num_run(cutoff_num_run=self.cutoff_num_run) + # Delete files of non-candidate models - can only be done after fitting the ensemble and + # saving it to disc so we do not accidentally delete models in the previous ensemble + if self.max_resident_models is not None: + self._delete_excess_models(selected_keys=candidate_models) + + # Save the read losses status for the next iteration + with open(self.ensemble_loss_file, "wb") as memory: + pickle.dump(self.read_losses, memory) + + if ensemble is not None: + train_pred = self.predict(set_="train", + ensemble=ensemble, + selected_keys=candidate_models, + n_preds=len(candidate_models), + index_run=iteration) + # TODO if predictions fails, build the model again during the + # next iteration! + test_pred = self.predict(set_="test", + ensemble=ensemble, + selected_keys=n_sel_test, + n_preds=len(candidate_models), + index_run=iteration) + + # Add a score to run history to see ensemble progress + self._add_ensemble_trajectory( + train_pred, + test_pred + ) + + # The loaded predictions and the hash can only be saved after the ensemble has been + # built, because the hash is computed during the construction of the ensemble + with open(self.ensemble_memory_file, "wb") as memory: + pickle.dump((self.read_preds, self.last_hash), memory) + + if return_predictions: + return self.ensemble_history, self.ensemble_nbest, train_pred, test_pred + else: + return self.ensemble_history, self.ensemble_nbest, None, None + + def compute_loss_per_model(self) -> bool: + """ + Compute the loss of the predictions on ensemble building data set; + populates self.read_preds and self.read_losses + """ + + self.logger.debug("Read ensemble data set predictions") + + if self.y_true_ensemble is None: + try: + self.y_true_ensemble = self.backend.load_targets_ensemble() + except FileNotFoundError: + self.logger.debug( + "Could not find true targets on ensemble data set: %s", + traceback.format_exc(), + ) + return False + + pred_path = os.path.join( + glob.escape(self.backend.get_runs_directory()), + '%d_*_*' % self.seed, + 'predictions_ensemble_%s_*_*.npy*' % self.seed, + ) + y_ens_files = glob.glob(pred_path) + y_ens_files = [y_ens_file for y_ens_file in y_ens_files + if y_ens_file.endswith('.npy') or y_ens_file.endswith('.npy.gz')] + self.y_ens_files = y_ens_files + # no validation predictions so far -- no files + if len(self.y_ens_files) == 0: + self.logger.debug("Found no prediction files on ensemble data set:" + " %s" % pred_path) + return False + + # First sort files chronologically + to_read = [] + for y_ens_fn in self.y_ens_files: + match = self.model_fn_re.search(y_ens_fn) + if match is None: + raise ValueError(f"Could not interpret file {y_ens_fn} " + "Something went wrong while scoring predictions") + _seed = int(match.group(1)) + _num_run = int(match.group(2)) + _budget = float(match.group(3)) + + to_read.append([y_ens_fn, match, _seed, _num_run, _budget]) + + n_read_files = 0 + # Now read file wrt to num_run + # Mypy assumes sorted returns an object because of the lambda. 
Can't get to recognize the list + # as a returning list, so as a work-around we skip next line + for y_ens_fn, match, _seed, _num_run, _budget in sorted(to_read, key=lambda x: x[3]): # type: ignore + # skip models that were part of previous stacking layer + if _num_run < self.initial_num_run: + if y_ens_fn in self.read_losses: + del self.read_losses[y_ens_fn] + continue + + if self.read_at_most and n_read_files >= self.read_at_most: + # limit the number of files that will be read + # to limit memory consumption + break + + if not y_ens_fn.endswith(".npy") and not y_ens_fn.endswith(".npy.gz"): + self.logger.info('Error loading file (not .npy or .npy.gz): %s', y_ens_fn) + continue + + if not self.read_losses.get(y_ens_fn): + self.read_losses[y_ens_fn] = { + "ens_loss": np.inf, + "mtime_ens": 0, + "mtime_test": 0, + "seed": _seed, + "num_run": _num_run, + "budget": _budget, + "disc_space_cost_mb": None, + # Lazy keys so far: + # 0 - not loaded + # 1 - loaded and in memory + # 2 - loaded but dropped again + # 3 - deleted from disk due to space constraints + "loaded": 0 + } + if not self.read_preds.get(y_ens_fn): + self.read_preds[y_ens_fn] = { + Y_ENSEMBLE: None, + Y_TEST: None, + } + + if self.read_losses[y_ens_fn]["mtime_ens"] == os.path.getmtime(y_ens_fn): + # same time stamp; nothing changed; + continue + + # actually read the predictions and compute their respective loss + try: + y_ensemble = self._read_np_fn(y_ens_fn) + losses = calculate_loss( + metrics=self.metrics, + target=self.y_true_ensemble, + prediction=y_ensemble, + task_type=self.task_type, + ) + + if np.isfinite(self.read_losses[y_ens_fn]["ens_loss"]): + self.logger.debug( + 'Changing ensemble loss for file %s from %f to %f ' + 'because file modification time changed? %f - %f', + y_ens_fn, + self.read_losses[y_ens_fn]["ens_loss"], + losses[self.opt_metric], + self.read_losses[y_ens_fn]["mtime_ens"], + os.path.getmtime(y_ens_fn), + ) + + self.read_losses[y_ens_fn]["ens_loss"] = losses[self.opt_metric] + + # It is not needed to create the object here + # To save memory, we just compute the loss. + self.read_losses[y_ens_fn]["mtime_ens"] = os.path.getmtime(y_ens_fn) + self.read_losses[y_ens_fn]["loaded"] = 2 + self.read_losses[y_ens_fn]["disc_space_cost_mb"] = self.get_disk_consumption( + y_ens_fn + ) + + n_read_files += 1 + + except Exception: + self.logger.warning( + 'Error loading %s: %s', + y_ens_fn, + traceback.format_exc(), + ) + self.read_losses[y_ens_fn]["ens_loss"] = np.inf + + self.logger.debug( + 'Done reading %d new prediction files. 
Loaded %d predictions in ' + 'total.', + n_read_files, + np.sum([pred["loaded"] > 0 for pred in self.read_losses.values()]) + ) + return True + + def fit_ensemble( + self, + selected_keys: List[str] + ) -> Optional[EnsembleSelectionPerLayerStackingEnsemble]: + """ + fit ensemble + + Parameters + --------- + selected_keys: list + list of selected keys of self.read_losses + + Returns + ------- + ensemble: StackingEnsemble + trained Ensemble + """ + + if self.unit_test: + raise MemoryError() + + predictions_train = [self.read_preds[k][Y_ENSEMBLE] for k in selected_keys] + include_num_runs = [ + ( + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"], + ) + for k in selected_keys] + + # check hash if ensemble training data changed + current_hash = "".join([ + str(zlib.adler32(predictions_train[i].data.tobytes())) + for i in range(len(predictions_train)) + ]) + if self.last_hash == current_hash: + self.logger.debug( + "No new model predictions selected -- skip ensemble building " + "-- current performance: %f", + self.validation_performance_, + ) + + return None + self.last_hash = current_hash + + opt_metric = [m for m in self.metrics if m.name == self.opt_metric][0] + if not opt_metric: + raise ValueError(f"Cannot optimize for {self.opt_metric} in {self.metrics} " + "as more than one unique optimization metric was found.") + + + cur_ensemble = EnsembleSelection( + ensemble_size=self.ensemble_size, + metric=opt_metric, + random_state=self.random_state, + task_type=self.task_type, + ) + + try: + # self.logger.debug( + # "Fitting the ensemble on %d models.", + # len(predictions_train), + # ) + + start_time = time.time() + cur_ensemble.fit( + predictions_train, + self.y_true_ensemble, + include_num_runs, + ) + + end_time = time.time() + self.logger.debug( + "Fitting the ensemble took %.2f seconds.", + end_time - start_time, + ) + # self.logger.debug(f"weights = {ensemble.weights_}") + self.logger.info(str(cur_ensemble)) + self.validation_performance_ = min( + self.validation_performance_, + cur_ensemble.get_validation_performance(), + ) + cur_ensemble_model_identifiers = self._get_identifiers_from_num_runs( + cur_ensemble.get_selected_model_identifiers() + ) + + ensemble = EnsembleSelectionPerLayerStackingEnsemble( + num_stacking_layers=self.num_stacking_layers, + cur_stacking_layer=self.cur_stacking_layer, + ensembles=self.ensembles, + ensemble_predictions=self.ensemble_predictions + ) + cur_ensemble_predictions_ensemble_set = [self.read_preds[k][Y_ENSEMBLE] for k in cur_ensemble_model_identifiers] + cur_ensemble_predictions_test_set = [self.read_preds[k][Y_TEST] for k in cur_ensemble_model_identifiers] + ensemble.fit(cur_ensemble=cur_ensemble, cur_ensemble_predictions={ + 'ensemble': cur_ensemble_predictions_ensemble_set, + 'test': cur_ensemble_predictions_test_set + }) + + except ValueError: + self.logger.error('Caught ValueError: %s', traceback.format_exc()) + return None + except IndexError: + self.logger.error('Caught IndexError: %s' + traceback.format_exc()) + return None + finally: + # Explicitly free memory + del predictions_train + + return ensemble + + def _get_ensemble_identifiers_filename(self, cur_stacking_layer) -> str: + return os.path.join(self.backend.internals_directory, f'ensemble_identifiers_{cur_stacking_layer}.pkl') + + def _get_ensemble_cutoff_num_run_filename(self): + return os.path.join(self.backend.internals_directory, 'ensemble_cutoff_run.txt') + + def _save_ensemble_cutoff_num_run(self, cutoff_num_run: int) -> None: + with 
open(self._get_ensemble_cutoff_num_run_filename(), "w") as file: + file.write(str(cutoff_num_run)) + + def _load_ensemble_cutoff_num_run(self) -> Optional[int]: + if os.path.exists(self._get_ensemble_cutoff_num_run_filename()): + with open(self._get_ensemble_cutoff_num_run_filename(), "r") as file: + cutoff_num_run = int(file.read()) + else: + cutoff_num_run = None + return cutoff_num_run + + def _save_current_ensemble_identifiers(self, ensemble_identifiers: List[Optional[str]], cur_stacking_layer) -> None: + with open(self._get_ensemble_identifiers_filename(cur_stacking_layer=cur_stacking_layer), "wb") as file: + pickle.dump(ensemble_identifiers, file=file) + + def _load_current_ensemble_identifiers(self, cur_stacking_layer) -> List[Optional[str]]: + file_name = self._get_ensemble_identifiers_filename(cur_stacking_layer) + if os.path.exists(file_name): + with open(file_name, "rb") as file: + identifiers = pickle.load(file) + else: + identifiers = [None]*self.ensemble_size + return identifiers + + def _load_stacked_ensemble_identifiers(self) -> List[List[Optional[str]]]: + ensemble_identifiers = list() + for i in range(self.num_stacking_layers): + ensemble_identifiers.append(self._load_current_ensemble_identifiers(cur_stacking_layer=i)) + return ensemble_identifiers + + def _get_identifiers_from_num_runs(self, num_runs, subset='ensemble') -> List[Optional[str]]: + identifiers: List[Optional[str]] = [] + for num_run in num_runs: + identifier = None + if num_run is not None: + seed, idx, budget = num_run + identifier = os.path.join( + self.backend.get_numrun_directory(seed, idx, budget), + self.backend.get_prediction_filename(subset, seed, idx, budget) + ) + identifiers.append(identifier) + return identifiers + + def _get_num_runs_from_identifiers(self, identifiers) -> List[Optional[Tuple[int, int, float]]]: + num_runs: List[Optional[Tuple[int, int, float]]] = [] + for identifier in identifiers: + num_run = None + if identifier is not None: + match = self.model_fn_re.search(identifier) + if match is None: + raise ValueError(f"Could not interpret file {identifier} " + "Something went wrong while scoring predictions") + _seed = int(match.group(1)) + _num_run = int(match.group(2)) + _budget = float(match.group(3)) + num_run = (_seed, _num_run, _budget) + num_runs.append(num_run) + + return num_runs \ No newline at end of file diff --git a/autoPyTorch/ensemble/repeat_models_stacking_ensemble.py b/autoPyTorch/ensemble/repeat_models_stacking_ensemble.py new file mode 100644 index 000000000..78d65220b --- /dev/null +++ b/autoPyTorch/ensemble/repeat_models_stacking_ensemble.py @@ -0,0 +1,179 @@ +from collections import Counter +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np + +from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble +from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection +from autoPyTorch.pipeline.base_pipeline import BasePipeline +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss + + +class RepeatModelsStackingEnsemble(AbstractEnsemble): + def __init__( + self, + base_ensemble: EnsembleSelection + ) -> None: + self.ensemble_identifiers: Optional[List[List[Tuple[int, int, float]]]] = None + self.base_ensemble = base_ensemble + self.base_weights = [w for w in base_ensemble.weights_ if w > 0] + self.ensemble_weights = None + + def fit( + self, + identifiers: List[Tuple[int, int, float]], + ) -> 
AbstractEnsemble: + """ + Builds a ensemble given the individual models out of fold predictions. + Fundamentally, defines a set of weights on how to perform a soft-voting + aggregation of the models in the given identifiers. + + Args: + predictions (List[np.ndarray]): + A list of individual model predictions of shape (n_datapoints, n_targets) + corresponding to the OutOfFold estimate of the ground truth + labels (np.ndarray): + The ground truth targets of shape (n_datapoints, n_targets) + identifiers: List[Tuple[int, int, float]] + A list of model identifiers, each with the form + (seed, number of run, budget) + + Returns: + A copy of self + """ + self.ensemble_identifiers = identifiers + self.ensemble_weights = [] + for layer_identifiers in identifiers: + layer_weights = [] + for i, identifier in enumerate(layer_identifiers): + if identifier is not None: + layer_weights.append(self.base_weights[i]) + self.ensemble_weights.append(layer_weights) + return self + + def _predict(self, predictions, weights): + """ + Given a list of predictions from the individual model, this method + aggregates the predictions using a soft voting scheme with the weights + found during training. + + Args: + predictions (List[np.ndarray]): + A list of predictions from the individual base models. + + Returns: + average (np.ndarray): Soft voting predictions of ensemble models, using + the weights found during ensemble selection (self._weights) + """ + + average = np.zeros_like(predictions[0], dtype=np.float64) + tmp_predictions = np.empty_like(predictions[0], dtype=np.float64) + + # if predictions.shape[0] == len(weights), + # predictions include those of zero-weight models. + if len(predictions) == len(weights): + for pred, weight in zip(predictions, weights): + np.multiply(pred, weight, out=tmp_predictions) + np.add(average, tmp_predictions, out=average) + + # if prediction model.shape[0] == len(non_null_weights), + # predictions do not include those of zero-weight models. + elif len(predictions) == np.count_nonzero(weights): + non_null_weights = [w for w in weights if w > 0] + for pred, weight in zip(predictions, non_null_weights): + np.multiply(pred, weight, out=tmp_predictions) + np.add(average, tmp_predictions, out=average) + + # If none of the above applies, then something must have gone wrong. + else: + raise ValueError("The dimensions of ensemble predictions" + " and ensemble weights do not match!") + del tmp_predictions + return average + + def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: + """ + Given a list of predictions from the individual model, this method + aggregates the predictions using a soft voting scheme with the weights + found during training. + + Args: + predictions (List[np.ndarray]): + A list of predictions from the individual base models. 
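The weighted soft vote computed by _predict above reduces to a per-class weighted average of the base models' probability estimates. A tiny worked example with made-up numbers:

import numpy as np

# Two base models, three samples, two classes; weights come from ensemble selection.
p1 = np.array([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]])
p2 = np.array([[0.6, 0.4], [0.4, 0.6], [0.1, 0.9]])
weights = [0.75, 0.25]

average = np.zeros_like(p1)
for pred, weight in zip([p1, p2], weights):
    average += weight * pred

print(average[0])  # 0.75 * [0.9, 0.1] + 0.25 * [0.6, 0.4] = [0.825, 0.175]
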
+ + Returns: + average (np.ndarray): Soft voting predictions of ensemble models, using + the weights found during ensemble selection (self._weights) + """ + + return self._predict(predictions=predictions, weights=self.ensemble_weights[-1]) + + def __str__(self) -> str: + return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \ + '\n\tWeights: %s\n\tIdentifiers: %s' % \ + (' '.join(['%d: %5f' % (idx, performance) + for idx, performance in enumerate(self.trajectory_)]), + self.indices_, self.weights_, + ' '.join([str(identifier) for idx, identifier in + enumerate(self.identifiers_) + if self.weights_[idx] > 0])) + + def get_models_with_weights( + self, + models: Dict[Any, BasePipeline] + ) -> List[Tuple[float, BasePipeline]]: + """ + Handy function to tag the provided input models with a given weight. + + Args: + models (List[Tuple[float, BasePipeline]]): + A dictionary that maps a model's name to it's actual python object. + + Returns: + output (List[Tuple[float, BasePipeline]]): + each model with the related weight, sorted by ascending + performance. Notice that ensemble selection solves a minimization + problem. + """ + outputs = [] + first_layer_models = models[0] + for _ in models: + outputs.append(self.base_ensemble.get_models_with_weights(first_layer_models)) + + return outputs + + def get_expanded_layer_stacking_ensemble_predictions( + self, + stacking_layer, + raw_stacking_layer_ensemble_predictions + ) -> List[np.ndarray]: + layer_weights = [weight for weight in self.base_ensemble.weights_ if weight > 0] + layer_size = self.base_ensemble.ensemble_size + ensemble_predictions = [] + for weight, pred in zip(layer_weights, raw_stacking_layer_ensemble_predictions): + ensemble_predictions.extend([pred] * int(weight * layer_size)) + return ensemble_predictions + + def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]: + """ + After training of ensemble selection, not all models will be used. + Some of them will have zero weight. This procedure filters this models + out. 
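get_expanded_layer_stacking_ensemble_predictions above repeats each member's predictions in proportion to its ensemble-selection weight, so a later layer can treat the previous one as a flat bag of equally weighted predictors. Roughly, with illustrative numbers:

import numpy as np

ensemble_size = 5
layer_weights = [0.4, 0.2, 0.2, 0.2]            # non-zero weights of the layer members
layer_preds = [np.full((3, 2), i) for i in range(4)]

expanded = []
for weight, pred in zip(layer_weights, layer_preds):
    # Each member contributes int(weight * ensemble_size) copies of its predictions.
    expanded.extend([pred] * int(weight * ensemble_size))

print(len(expanded))  # 2 + 1 + 1 + 1 = 5 entries in total
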
+ + Returns: + output (List[Tuple[int, int, float]]): + The models actually used by ensemble selection + """ + return self.ensemble_identifiers + + def get_validation_performance(self) -> float: + """ + Returns the best optimization performance seen during hill climbing + + Returns: + (float): + best ensemble training performance + """ + return self.base_ensemble.trajectory_[-1] + diff --git a/autoPyTorch/ensemble/utils.py b/autoPyTorch/ensemble/utils.py index 705d17e24..17fe011d0 100644 --- a/autoPyTorch/ensemble/utils.py +++ b/autoPyTorch/ensemble/utils.py @@ -1,16 +1,32 @@ from enum import IntEnum from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilder -from autoPyTorch.ensemble.stacking_ensemble_builder import StackingEnsembleBuilder +from autoPyTorch.ensemble.ensemble_optimisation_stacking_ensemble_builder import EnsembleOptimisationStackingEnsembleBuilder +from autoPyTorch.ensemble.ensemble_selection_per_layer_stacking_ensemble_builder import EnsembleSelectionPerLayerStackingEnsembleBuilder class EnsembleSelectionTypes(IntEnum): ensemble_selection = 1 - stacking_ensemble = 2 + stacking_optimisation_ensemble = 2 + stacking_ensemble_selection_per_layer = 3 + stacking_repeat_models = 4 + stacking_autogluon = 5 + + def is_stacking_ensemble(self) -> bool: + stacked = [self.stacking_optimisation_ensemble, + self.stacking_ensemble_selection_per_layer, + self.stacking_repeat_models, + self.stacking_autogluon] + return getattr(self, self.name) in stacked def get_ensemble_builder_class(ensemble_method: int): - if ensemble_method == EnsembleSelectionTypes.ensemble_selection: + if ( + ensemble_method == EnsembleSelectionTypes.ensemble_selection + or ensemble_method == EnsembleSelectionTypes.stacking_repeat_models + ): return EnsembleBuilder - elif ensemble_method == EnsembleSelectionTypes.stacking_ensemble: - return StackingEnsembleBuilder + elif ensemble_method == EnsembleSelectionTypes.stacking_optimisation_ensemble: + return EnsembleOptimisationStackingEnsembleBuilder + elif ensemble_method == EnsembleSelectionTypes.stacking_ensemble_selection_per_layer: + return EnsembleSelectionPerLayerStackingEnsembleBuilder diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 3fcc64889..f19f24b0d 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -83,15 +83,19 @@ def __init__(self, config: str, self.init_params = init_params self.pipeline = autoPyTorch.pipeline.traditional_tabular_classification. 
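The enum added to autoPyTorch/ensemble/utils.py above gives a single switch for choosing the builder; under the assumptions of that hunk (and the patched module paths), its use looks like this, with the results the hunk implies noted in comments:

from autoPyTorch.ensemble.utils import EnsembleSelectionTypes, get_ensemble_builder_class

method = EnsembleSelectionTypes.stacking_ensemble_selection_per_layer
print(method.is_stacking_ensemble())       # True for every stacking_* member
print(get_ensemble_builder_class(method))  # EnsembleSelectionPerLayerStackingEnsembleBuilder

# Plain ensemble selection and the repeat-models variant reuse the existing builder:
print(get_ensemble_builder_class(EnsembleSelectionTypes.stacking_repeat_models))  # EnsembleBuilder
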
\ TraditionalTabularClassificationPipeline(dataset_properties=dataset_properties, - random_state=self.random_state) - configuration_space = self.pipeline.get_hyperparameter_search_space() - default_configuration = configuration_space.get_default_configuration().get_dictionary() - default_configuration['model_trainer:tabular_traditional_model:traditional_learner'] = config - self.configuration = Configuration(configuration_space, default_configuration) - self.pipeline.set_hyperparameters(self.configuration) + random_state=self.random_state, + search_space_updates=self._get_search_space_updates()) + # configuration_space = self.pipeline.get_hyperparameter_search_space() + # default_configuration = configuration_space.get_default_configuration().get_dictionary() + # default_configuration['model_trainer:tabular_traditional_model:traditional_learner'] = config + # self.configuration = Configuration(configuration_space, default_configuration) + # self.pipeline.set_hyperparameters(self.configuration) + self.configuration = self.pipeline.config + self.is_fitted_ = False def fit(self, X: Dict[str, Any], y: Any, sample_weight: Optional[np.ndarray] = None) -> object: + self.is_fitted_ = True return self.pipeline.fit(X, y) def predict_proba(self, X: Union[np.ndarray, pd.DataFrame], @@ -113,12 +117,18 @@ def get_additional_run_info(self) -> Dict[str, Any]: Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs """ return {'pipeline_configuration': self.configuration, - 'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config(), + # 'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config(), 'configuration_origin': 'traditional'} def get_pipeline_representation(self) -> Dict[str, str]: return self.pipeline.get_pipeline_representation() + def _get_search_space_updates(self): + from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + updates = HyperparameterSearchSpaceUpdates() + updates.append(node_name='model_trainer', hyperparameter='traditional_learner', value_range=(self.config,), default_value=self.config) + return updates + @staticmethod def get_default_pipeline_options() -> Dict[str, Any]: return autoPyTorch.pipeline.traditional_tabular_classification. \ @@ -153,15 +163,19 @@ def __init__(self, config: str, self.init_params = init_params self.pipeline = autoPyTorch.pipeline.traditional_tabular_regression. 
\ TraditionalTabularRegressionPipeline(dataset_properties=dataset_properties, - random_state=self.random_state) - configuration_space = self.pipeline.get_hyperparameter_search_space() - default_configuration = configuration_space.get_default_configuration().get_dictionary() - default_configuration['model_trainer:tabular_traditional_model:traditional_learner'] = config - self.configuration = Configuration(configuration_space, default_configuration) - self.pipeline.set_hyperparameters(self.configuration) + random_state=self.random_state, + search_space_updates=self._get_search_space_updates()) + # configuration_space = self.pipeline.get_hyperparameter_search_space() + # default_configuration = configuration_space.get_default_configuration().get_dictionary() + # default_configuration['model_trainer:tabular_traditional_model:traditional_learner'] = config + # self.configuration = Configuration(configuration_space, default_configuration) + # self.pipeline.set_hyperparameters(self.configuration) + self.configuration = self.pipeline.config + self.is_fitted_ = False def fit(self, X: Dict[str, Any], y: Any, sample_weight: Optional[np.ndarray] = None) -> object: + self.is_fitted_ = True return self.pipeline.fit(X, y) def predict(self, X: Union[np.ndarray, pd.DataFrame], @@ -179,15 +193,22 @@ def get_additional_run_info(self) -> Dict[str, Any]: Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs """ return {'pipeline_configuration': self.configuration, - 'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config()} + # 'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config(), + 'configuration_origin': 'traditional'} def get_pipeline_representation(self) -> Dict[str, str]: return self.pipeline.get_pipeline_representation() + def _get_search_space_updates(self): + from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + updates = HyperparameterSearchSpaceUpdates() + updates.append(node_name='model_trainer', hyperparameter='traditional_learner', value_range=(self.config,), default_value=self.config) + return updates + @staticmethod def get_default_pipeline_options() -> Dict[str, Any]: - return autoPyTorch.pipeline.traditional_tabular_regression.\ - TraditionalTabularRegressionPipeline.get_default_pipeline_options() + return autoPyTorch.pipeline.traditional_tabular_classification. 
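Both traditional pipeline wrappers now pin the learner through a search-space update instead of assembling a Configuration by hand. Standalone, the pattern from _get_search_space_updates reads as follows (the 'lgb' value is only an example):

from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates

updates = HyperparameterSearchSpaceUpdates()
# Pin the traditional_learner hyperparameter of the model_trainer node to one value,
# so the pipeline can only instantiate the requested traditional model.
updates.append(node_name='model_trainer',
               hyperparameter='traditional_learner',
               value_range=('lgb',),
               default_value='lgb')
# The resulting object is what the wrappers pass as search_space_updates=... above.
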
\ + TraditionalTabularClassificationPipeline.get_default_pipeline_options() class DummyClassificationPipeline(DummyClassifier): @@ -216,9 +237,11 @@ def __init__(self, config: Configuration, super(DummyClassificationPipeline, self).__init__(strategy="uniform") else: super(DummyClassificationPipeline, self).__init__(strategy="most_frequent") + self.is_fitted_ = False def fit(self, X: Dict[str, Any], y: Any, sample_weight: Optional[np.ndarray] = None) -> object: + self.is_fitted_ = True X_train = subsampler(X['X_train'], X['train_indices']) y_train = subsampler(X['y_train'], X['train_indices']) return super(DummyClassificationPipeline, self).fit(np.ones((X_train.shape[0], 1)), y_train, @@ -278,9 +301,11 @@ def __init__(self, config: Configuration, super(DummyRegressionPipeline, self).__init__(strategy='mean') else: super(DummyRegressionPipeline, self).__init__(strategy='median') + self.is_fitted_ = False def fit(self, X: Dict[str, Any], y: Any, sample_weight: Optional[np.ndarray] = None) -> object: + self.is_fitted_ = True X_train = subsampler(X['X_train'], X['train_indices']) y_train = subsampler(X['y_train'], X['train_indices']) return super(DummyRegressionPipeline, self).fit(np.ones((X_train.shape[0], 1)), y_train, @@ -425,7 +450,8 @@ def __init__(self, backend: Backend, logger_port: Optional[int] = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - use_ensemble_opt_loss=False + use_ensemble_opt_loss=False, + cur_stacking_layer: int = 0 ) -> None: self.starttime = time.time() @@ -499,7 +525,7 @@ def __init__(self, backend: Backend, else self.pipeline_class.get_default_pipeline_options() self.budget_type = pipeline_config['budget_type'] if budget_type is None else budget_type self.budget = pipeline_config[self.budget_type] if budget == 0 else budget - + self.cutoff = pipeline_config['func_eval_time_limit_secs'] * 0.9 self.num_run = 0 if num_run is None else num_run logger_name = '%s(%d)' % (self.__class__.__name__.split('.')[-1], @@ -768,6 +794,9 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], if test_loss is not None: additional_run_info['test_loss'] = test_loss + additional_run_info['configuration'] = self.configuration if not isinstance(self.configuration, Configuration) else self.configuration.get_dictionary() + additional_run_info['budget'] = self.budget + rval_dict = {'loss': cost, 'additional_run_info': additional_run_info, 'status': status} diff --git a/autoPyTorch/evaluation/ensemble_optimisation_evaluator.py b/autoPyTorch/evaluation/ensemble_optimisation_evaluator.py new file mode 100644 index 000000000..569068154 --- /dev/null +++ b/autoPyTorch/evaluation/ensemble_optimisation_evaluator.py @@ -0,0 +1,648 @@ +from math import floor +from multiprocessing.queues import Queue +import os +import time +from typing import Any, Dict, List, Optional, Tuple, Union + +from ConfigSpace.configuration_space import Configuration + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.ensemble import VotingClassifier + +from smac.tae import StatusType + +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.constants import ( + CLASSIFICATION_TASKS, + MULTICLASSMULTIOUTPUT, +) +from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes, RepeatedCrossValTypes +from autoPyTorch.ensemble.ensemble_optimisation_stacking_ensemble_builder import calculate_nomalised_margin_loss +from autoPyTorch.evaluation.abstract_evaluator import ( + 
AbstractEvaluator, + fit_and_suppress_warnings +) +from autoPyTorch.ensemble.ensemble_optimisation_stacking_ensemble import EnsembleOptimisationStackingEnsemble +from autoPyTorch.evaluation.utils import VotingRegressorWrapper, check_pipeline_is_fitted +from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.utils.common import dict_repr, subsampler +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + +__all__ = ['EnsembleOptimisationEvaluator', 'eval_ensemble_optimise_function'] + + +def _get_y_array(y: np.ndarray, task_type: int) -> np.ndarray: + if task_type in CLASSIFICATION_TASKS and task_type != \ + MULTICLASSMULTIOUTPUT: + return y.ravel() + else: + return y + + +class EnsembleOptimisationEvaluator(AbstractEvaluator): + """ + This class builds a pipeline using the provided configuration. + A pipeline implementing the provided configuration is fitted + using the datamanager object retrieved from disc, via the backend. + After the pipeline is fitted, it is save to disc and the performance estimate + is communicated to the main process via a Queue. + + Attributes: + backend (Backend): + An object to interface with the disk storage. In particular, allows to + access the train and test datasets + queue (Queue): + Each worker available will instantiate an evaluator, and after completion, + it will return the evaluation result via a multiprocessing queue + metric (autoPyTorchMetric): + A scorer object that is able to evaluate how good a pipeline was fit. It + is a wrapper on top of the actual score method (a wrapper on top of scikit + lean accuracy for example) that formats the predictions accordingly. + budget: (float): + The amount of epochs/time a configuration is allowed to run. + budget_type (str): + The budget type, which can be epochs or time + pipeline_config (Optional[Dict[str, Any]]): + Defines the content of the pipeline being evaluated. For example, it + contains pipeline specific settings like logging name, or whether or not + to use tensorboard. + configuration (Union[int, str, Configuration]): + Determines the pipeline to be constructed. A dummy estimator is created for + integer configurations, a traditional machine learning pipeline is created + for string based configuration, and NAS is performed when a configuration + object is passed. + seed (int): + A integer that allows for reproducibility of results + output_y_hat_optimization (bool): + Whether this worker should output the target predictions, so that they are + stored on disk. Fundamentally, the resampling strategy might shuffle the + Y_train targets, so we store the split in order to re-use them for ensemble + selection. + num_run (Optional[int]): + An identifier of the current configuration being fit. This number is unique per + configuration. + include (Optional[Dict[str, Any]]): + An optional dictionary to include components of the pipeline steps. + exclude (Optional[Dict[str, Any]]): + An optional dictionary to exclude components of the pipeline steps. + disable_file_output (Union[bool, List[str]]): + By default, the model, it's predictions and other metadata is stored on disk + for each finished configuration. This argument allows the user to skip + saving certain file type, for example the model, from being written to disk. + init_params (Optional[Dict[str, Any]]): + Optional argument that is passed to each pipeline step. 
It is the equivalent of + kwargs for the pipeline steps. + logger_port (Optional[int]): + Logging is performed using a socket-server scheme to be robust against many + parallel entities that want to write to the same file. This integer states the + socket port for the communication channel. If None is provided, a traditional + logger is used. + all_supported_metrics (bool): + Whether all supported metric should be calculated for every configuration. + search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + An object used to fine tune the hyperparameter search space of the pipeline + """ + def __init__(self, backend: Backend, queue: Queue, + metric: autoPyTorchMetric, + budget: float, + configuration: Union[int, str, Configuration], + budget_type: str = None, + pipeline_config: Optional[Dict[str, Any]] = None, + seed: int = 1, + output_y_hat_optimization: bool = True, + num_run: Optional[int] = None, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + disable_file_output: Union[bool, List] = False, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + use_ensemble_opt_loss=False, + cur_stacking_layer: int = 0) -> None: + super().__init__( + backend=backend, + queue=queue, + configuration=configuration, + metric=metric, + seed=seed, + output_y_hat_optimization=output_y_hat_optimization, + num_run=num_run, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates, + use_ensemble_opt_loss=use_ensemble_opt_loss + ) + + self.cur_stacking_layer = cur_stacking_layer + self.num_repeats = len(self.splits) + self.num_folds = len(self.splits[0]) + self.logger.debug("use_ensemble_loss :{}".format(self.use_ensemble_opt_loss)) + self.old_ensemble: Optional[EnsembleOptimisationStackingEnsemble] = None + ensemble_dir = self.backend.get_ensemble_dir() + if os.path.exists(ensemble_dir) and len(os.listdir(ensemble_dir)) >= 1: + self.old_ensemble = self.backend.load_ensemble(self.seed) + assert isinstance(self.old_ensemble, EnsembleOptimisationStackingEnsemble) + + self.logger.debug(f"for num run: {num_run}, X_train.shape: {self.X_train.shape} and X_test.shape: {self.X_test.shape}") + + def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], + valid_pred: Optional[np.ndarray], + test_pred: Optional[np.ndarray], + pipeline_opt_pred: np.ndarray, + ensemble_opt_pred: np.ndarray, + additional_run_info: Optional[Dict], + file_output: bool, status: StatusType, + ) -> Optional[Tuple[float, float, int, Dict]]: + """This function does everything necessary after the fitting is done: + * predicting + * saving the necessary files + We use it as the signal handler so we can recycle the code for the + normal usecase and when the runsolver kills us here :)""" + + self.duration = time.time() - self.starttime + + if file_output: + loss_, additional_run_info_ = self.file_output( + pipeline_opt_pred, valid_pred, test_pred + ) + else: + loss_ = None + additional_run_info_ = {} + + validation_loss, test_loss = self.calculate_auxiliary_losses( + valid_pred, test_pred + ) + + pipeline_loss, _ = self.calculate_auxiliary_losses( + pipeline_opt_pred, None + ) + + if loss_ is not 
None: + return self.duration, loss_, self.seed, additional_run_info_ + + cost = loss["ensemble_opt_loss"] if self.use_ensemble_opt_loss else loss[self.metric.name] + + additional_run_info = ( + {} if additional_run_info is None else additional_run_info + ) + for metric_name, value in loss.items(): + additional_run_info[metric_name] = value + additional_run_info['duration'] = self.duration + additional_run_info['num_run'] = self.num_run + if pipeline_loss is not None: + additional_run_info['pipeline_loss'] = pipeline_loss + if train_loss is not None: + additional_run_info['train_loss'] = train_loss + if validation_loss is not None: + additional_run_info['validation_loss'] = validation_loss + if test_loss is not None: + additional_run_info['test_loss'] = test_loss + additional_run_info['configuration'] = self.configuration if not isinstance(self.configuration, Configuration) else self.configuration.get_dictionary() + additional_run_info['budget'] = self.budget + + additional_run_info['opt_loss'] = loss + rval_dict = {'loss': cost, + 'additional_run_info': additional_run_info, + 'status': status} + + self.queue.put(rval_dict) + return None + + def get_sorted_preds(self, preds: List[List[np.ndarray]], repeat_id: int) -> np.ndarray: + predictions = np.concatenate([pred for pred in preds if pred is not None]) + indices = np.concatenate([test_indices for _, test_indices in self.splits[repeat_id]]) + zipped_lists = zip(indices, predictions) + + sorted_zipped_lists = sorted(zipped_lists) + predictions = [pred for _, pred in sorted_zipped_lists] + return predictions + + def get_sorted_train_preds(self, preds: List[List[np.ndarray]], repeat_id: int): + predictions = np.concatenate([pred for pred in preds if pred is not None]) + indices = np.concatenate([train_indices for train_indices, _ in self.splits[repeat_id]]) + + unique_indices = set(indices) + sorted_predictions = np.zeros((len(unique_indices), self.num_classes)) + + for i in unique_indices: + positions = np.where(indices == i) + tmp = list() + for position in positions: + tmp.append(predictions[position]) + mean_tmp = np.squeeze(np.mean(tmp, axis=1)) + for j, mean in enumerate(mean_tmp): + sorted_predictions[i][j] = mean + return sorted_predictions + + def get_sorted_train_targets(self, preds: List[List[np.ndarray]], repeat_id: int): + predictions = np.concatenate([pred for pred in preds if pred is not None]) + indices = np.concatenate([train_indices for train_indices, _ in self.splits[repeat_id]]) + + unique_indices = set(indices) + sorted_predictions = np.zeros(len(unique_indices)) + + for i in unique_indices: + positions = np.where(indices == i) + tmp = list() + for position in positions: + tmp.append(predictions[position]) + mean_tmp = np.squeeze(np.mean(tmp, axis=1)) + sorted_predictions[i] = mean_tmp + return sorted_predictions + + def file_output( + self, + Y_optimization_pred: np.ndarray, + Y_valid_pred: np.ndarray, + Y_test_pred: np.ndarray, + ) -> Tuple[Optional[float], Dict]: + + # Abort in case of shape misalignment + if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]: + return ( + 1.0, + { + 'error': + "Targets %s and prediction %s don't have " + "the same length. Probably training didn't " + "finish" % (self.Y_optimization.shape, Y_optimization_pred.shape) + }, + ) + + # Abort if predictions contain NaNs + for y, s in [ + # Y_train_pred deleted here. Fix unittest accordingly. 
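get_sorted_preds above stitches the per-fold out-of-fold predictions back into the original row order of the training set, so that row i of the result corresponds to training sample i. A compact sketch with made-up folds (a sort key is used here to avoid comparing arrays on ties):

import numpy as np

# Two folds: predictions arrive in fold order, together with each fold's test indices.
fold_preds = [np.array([[0.9, 0.1], [0.8, 0.2]]), np.array([[0.3, 0.7], [0.4, 0.6]])]
fold_test_indices = [np.array([2, 0]), np.array([3, 1])]

predictions = np.concatenate(fold_preds)
indices = np.concatenate(fold_test_indices)
ordered = [pred for _, pred in sorted(zip(indices, predictions), key=lambda t: t[0])]

print(np.array(ordered))
# row 0 -> [0.8, 0.2], row 1 -> [0.4, 0.6], row 2 -> [0.9, 0.1], row 3 -> [0.3, 0.7]
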
+ [Y_optimization_pred, 'optimization'], + [Y_valid_pred, 'validation'], + [Y_test_pred, 'test'] + ]: + if y is not None and not np.all(np.isfinite(y)): + return ( + 1.0, + { + 'error': + 'Model predictions for %s set contains NaNs.' % s + }, + ) + + # Abort if we don't want to output anything. + if hasattr(self, 'disable_file_output'): + if self.disable_file_output: + return None, {} + else: + self.disabled_file_outputs = [] + + # This file can be written independently of the others down bellow + if 'y_optimization' not in self.disabled_file_outputs: + if self.output_y_hat_optimization: + self.backend.save_targets_ensemble(self.Y_optimization) + + if hasattr(self, 'pipelines') and self.pipelines is not None and isinstance(self.resampling_strategy, RepeatedCrossValTypes): + if self.pipelines[0] is not None and len(self.pipelines) > 0: + if 'pipelines' not in self.disabled_file_outputs: + if self.task_type in CLASSIFICATION_TASKS: + pipelines = VotingClassifier(estimators=None, voting='soft', ) + else: + pipelines = VotingRegressorWrapper(estimators=None) + pipelines.estimators_ = [pipeline for repeat_pipelines in self.pipelines for pipeline in repeat_pipelines if check_pipeline_is_fitted(pipeline, self.configuration)] + else: + pipelines = None + else: + pipelines = None + else: + pipelines = None + + if hasattr(self, 'pipeline') and self.pipeline is not None and isinstance(self.resampling_strategy, HoldoutValTypes): + if 'pipeline' not in self.disabled_file_outputs: + pipeline = self.pipeline + else: + pipeline = None + else: + # need a pipeline to get representation of the model. + # see https://github.com/automl/Auto-PyTorch/blob/master/autoPyTorch/api/base_task.py#L467 + pipeline = self.pipelines[-1][-1] + + self.logger.debug("Saving model {}_{}_{} to disk".format(self.seed, self.num_run, self.budget)) + self.backend.save_numrun_to_dir( + seed=int(self.seed), + idx=int(self.num_run), + budget=float(self.budget), + model=pipeline, + cv_model=pipelines, + ensemble_predictions=( + Y_optimization_pred if 'y_optimization' not in + self.disabled_file_outputs else None + ), + valid_predictions=( + Y_valid_pred if 'y_valid' not in + self.disabled_file_outputs else None + ), + test_predictions=( + Y_test_pred if 'y_test' not in + self.disabled_file_outputs else None + ), + ) + + return None, {} + + def fit_predict_and_loss(self) -> None: + """Fit, predict and compute the loss for cross-validation and + holdout""" + assert self.splits is not None, "Can't fit pipeline in {} is datamanager.splits is None" \ + .format(self.__class__.__name__) + + Y_train_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + Y_pipeline_optimization_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + Y_valid_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + Y_test_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + # Y_train_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + # Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + + self.pipelines = [[self._get_pipeline() for _ in range(self.num_folds)] for _ in range(self.num_repeats)] + + additional_run_info = {} + + total_repeats = self.num_repeats + for repeat_id, folds in enumerate(self.splits): + if repeat_id >= total_repeats: + break + y_train_pred_folds = [None] * self.num_folds + y_pipeline_optimization_pred_folds = [None] * self.num_folds + y_valid_pred_folds = [None] * self.num_folds + y_test_pred_folds = [None] * self.num_folds + # y_train_targets: 
List[Optional[np.ndarray]] = [None] * self.num_folds + # y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + + for i, (train_split, test_split) in enumerate(folds): + starttime = time.time() + self.logger.info(f"Starting fit for repeat: {repeat_id} and fold: {i}") + pipeline = self.pipelines[repeat_id][i] + ( + y_train_pred, + y_pipeline_opt_pred, + y_valid_pred, + y_test_pred, + ) = self._fit_and_predict(pipeline, i, repeat_id, + train_indices=train_split, + test_indices=test_split) + y_train_pred_folds[i] = y_train_pred + y_pipeline_optimization_pred_folds[i] = y_pipeline_opt_pred + if y_valid_pred is not None: + y_valid_pred_folds[i] = y_valid_pred + if y_test_pred is not None: + y_test_pred_folds[i] = y_test_pred + + # y_train_targets[i] = self.y_train[train_split] + # y_targets[i] = self.y_train[test_split] + + additional_run_info.update(pipeline.get_additional_run_info() if hasattr( + pipeline, 'get_additional_run_info') and pipeline.get_additional_run_info() is not None else {}) + duration_fit_single = time.time() - starttime + if repeat_id == 0 and i == 0: + expected_num_folds = floor(self.cutoff/(1.15*duration_fit_single)) + self.logger.debug(f"cutoff :{self.cutoff}, expected num folds: {expected_num_folds}, duration_fit_single: {duration_fit_single}") + expected_total_repeats = floor(expected_num_folds/self.num_folds) + if expected_total_repeats < total_repeats: + self.logger.debug(f"For num_run: {self.num_run}, expected repeats of cross validation: {expected_total_repeats} " + f"is less than the given value: {total_repeats}. Will only run for {expected_total_repeats}") + total_repeats = expected_total_repeats + if total_repeats <= repeat_id: + raise ValueError("Not expected to complete first repeat, terminating configuration") + + Y_train_pred[repeat_id] = self.get_sorted_train_preds(y_train_pred_folds, repeat_id) + Y_pipeline_optimization_pred[repeat_id] = self.get_sorted_preds(y_pipeline_optimization_pred_folds, repeat_id) + if self.X_valid is not None: + Y_valid_pred[repeat_id] = np.array([y_valid_pred_folds[i] for i in range(self.num_folds) if y_valid_pred_folds[i] is not None]) + # Average the predictions of several pipelines + if len(Y_valid_pred[repeat_id].shape) == 3: + Y_valid_pred[repeat_id] = np.nanmean(Y_valid_pred[repeat_id], axis=0) + else: + Y_valid_pred = None + + if self.X_test is not None: + Y_test_pred[repeat_id] = np.array([y_test_pred_folds[i] for i in range(self.num_folds) if y_test_pred_folds[i] is not None]) + # Average the predictions of several pipelines of the folds + if len(Y_test_pred[repeat_id].shape) == 3: + Y_test_pred[repeat_id] = np.nanmean(Y_test_pred[repeat_id], axis=0) + else: + Y_test_pred = None + + # # as targets do change within repeats + # Y_targets = self.y_train.copy() # self.get_sorted_preds(y_targets, -1) + # Y_train_targets = self.y_train.copy() # self.get_sorted_train_targets(y_train_targets, -1) + + # Average prediction values accross repeats + Y_train_pred = np.nanmean(Y_train_pred[:total_repeats], axis=0) + Y_pipeline_optimization_pred = np.nanmean(Y_pipeline_optimization_pred[:total_repeats], axis=0) + Y_valid_pred = np.nanmean(Y_valid_pred[:total_repeats], axis=0) if Y_valid_pred is not None else None + Y_test_pred = np.nanmean(Y_test_pred[:total_repeats], axis=0) if Y_test_pred is not None else None + + if self.old_ensemble is not None: + Y_ensemble_optimization_pred = self.old_ensemble.predict_with_current_pipeline(Y_pipeline_optimization_pred) + Y_ensemble_preds = 
self.old_ensemble.get_ensemble_predictions_with_current_pipeline(Y_pipeline_optimization_pred) + else: + Y_ensemble_optimization_pred = Y_pipeline_optimization_pred.copy() + Y_ensemble_preds = [Y_pipeline_optimization_pred] + + self.Y_optimization = self.y_train # np.array(Y_targets) + self.Y_actual_train = self.y_train # np.array(Y_train_targets) + + self.pipeline = self._get_pipeline() + + train_loss = self._loss(self.Y_actual_train, Y_train_pred) + opt_loss = self._loss(self.Y_optimization, Y_ensemble_optimization_pred) + + opt_loss ['ensemble_opt_loss'] = calculate_nomalised_margin_loss(Y_ensemble_preds, self.Y_optimization) + status = StatusType.SUCCESS + self.logger.debug("In train evaluator fit_predict_and_loss, num_run: {} loss:{}".format( + self.num_run, + opt_loss + )) + self.finish_up( + loss=opt_loss, + train_loss=train_loss, + ensemble_opt_pred=Y_ensemble_optimization_pred, + valid_pred=Y_valid_pred, + test_pred=Y_test_pred, + additional_run_info=additional_run_info, + file_output=True, + status=status, + pipeline_opt_pred=Y_pipeline_optimization_pred + ) + + def _fit_and_predict( + self, + pipeline: BaseEstimator, + fold: int, + repeat_id: int, + train_indices: Union[np.ndarray, List], + test_indices: Union[np.ndarray, List], + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: + + # See autoPyTorch/pipeline/components/base_component.py::autoPyTorchComponent for more details + # about fit_dictionary + X = {'train_indices': train_indices, + 'val_indices': test_indices, + 'split_id': fold, + 'repeat_id': repeat_id, + 'num_run': self.num_run, + **self.fit_dictionary} # fit dictionary + y = None + fit_and_suppress_warnings(self.logger, pipeline, X, y) + self.logger.info("Model fitted, now predicting") + Y_train_pred, Y_pipeline_opt_pred, Y_valid_pred, Y_test_pred = self._predict( + pipeline, + train_indices=train_indices, + test_indices=test_indices, + ) + + self.pipeline = pipeline + + return Y_train_pred, Y_pipeline_opt_pred, Y_valid_pred, Y_test_pred + + def _predict( + self, + pipeline: BaseEstimator, + test_indices: Union[np.ndarray, List], + train_indices: Union[np.ndarray, List] + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: + train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, + self.y_train[train_indices]) + + pipeline_opt_pred = self.predict_function(subsampler(self.X_train, test_indices), pipeline, + self.y_train[train_indices]) + + # self.logger.debug(f"for model {self.seed}_{self.num_run}_{self.budget} ensemble_predictions are {ensemble_opt_pred}") + if self.X_valid is not None: + valid_pred = self.predict_function(self.X_valid, pipeline, + self.y_valid) + else: + valid_pred = None + + if self.X_test is not None: + test_pred = self.predict_function(self.X_test, pipeline, + self.y_train[train_indices]) + else: + test_pred = None + + return train_pred, pipeline_opt_pred, valid_pred, test_pred + + +# create closure for evaluating an algorithm +def eval_ensemble_optimise_function( + backend: Backend, + queue: Queue, + metric: autoPyTorchMetric, + budget: float, + config: Optional[Configuration], + seed: int, + num_run: int, + include: Optional[Dict[str, Any]], + exclude: Optional[Dict[str, Any]], + disable_file_output: Union[bool, List], + output_y_hat_optimization: bool, + pipeline_config: Optional[Dict[str, Any]] = None, + budget_type: str = None, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = 
None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + use_ensemble_opt_loss=False, + cur_stacking_layer: int = 0, + instance: str = None, +) -> None: + """ + This closure allows the communication between the ExecuteTaFuncWithQueue and the + pipeline trainer (TrainEvaluator). + + Fundamentally, smac calls the ExecuteTaFuncWithQueue.run() method, which internally + builds a TrainEvaluator. The TrainEvaluator builds a pipeline, stores the output files + to disc via the backend, and puts the performance result of the run in the queue. + + + Attributes: + backend (Backend): + An object to interface with the disk storage. In particular, allows to + access the train and test datasets + queue (Queue): + Each worker available will instantiate an evaluator, and after completion, + it will return the evaluation result via a multiprocessing queue + metric (autoPyTorchMetric): + A scorer object that is able to evaluate how good a pipeline was fit. It + is a wrapper on top of the actual score method (a wrapper on top of scikit + lean accuracy for example) that formats the predictions accordingly. + budget: (float): + The amount of epochs/time a configuration is allowed to run. + budget_type (str): + The budget type, which can be epochs or time + pipeline_config (Optional[Dict[str, Any]]): + Defines the content of the pipeline being evaluated. For example, it + contains pipeline specific settings like logging name, or whether or not + to use tensorboard. + config (Union[int, str, Configuration]): + Determines the pipeline to be constructed. + seed (int): + A integer that allows for reproducibility of results + output_y_hat_optimization (bool): + Whether this worker should output the target predictions, so that they are + stored on disk. Fundamentally, the resampling strategy might shuffle the + Y_train targets, so we store the split in order to re-use them for ensemble + selection. + num_run (Optional[int]): + An identifier of the current configuration being fit. This number is unique per + configuration. + include (Optional[Dict[str, Any]]): + An optional dictionary to include components of the pipeline steps. + exclude (Optional[Dict[str, Any]]): + An optional dictionary to exclude components of the pipeline steps. + disable_file_output (Union[bool, List[str]]): + By default, the model, it's predictions and other metadata is stored on disk + for each finished configuration. This argument allows the user to skip + saving certain file type, for example the model, from being written to disk. + init_params (Optional[Dict[str, Any]]): + Optional argument that is passed to each pipeline step. It is the equivalent of + kwargs for the pipeline steps. + logger_port (Optional[int]): + Logging is performed using a socket-server scheme to be robust against many + parallel entities that want to write to the same file. This integer states the + socket port for the communication channel. If None is provided, a traditional + logger is used. + instance (str): + An instance on which to evaluate the current pipeline. By default we work + with a single instance, being the provided X_train, y_train of a single dataset. + This instance is a compatibility argument for SMAC, that is capable of working + with multiple datasets at the same time. 
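fit_predict_and_loss above decides after the very first fold how many cross-validation repeats fit into the evaluation cutoff, applying a 15% safety margin to the measured fold duration. The heuristic in isolation, with illustrative numbers:

from math import floor

cutoff = 540.0               # seconds allowed for this configuration (example value)
duration_fit_single = 30.0   # measured duration of the first fold (example value)
num_folds = 5                # folds per repeat (example value)

expected_num_folds = floor(cutoff / (1.15 * duration_fit_single))  # 15 folds fit
expected_total_repeats = floor(expected_num_folds / num_folds)     # 3 repeats fit

# If this is smaller than the configured number of repeats, only that many repeats
# are run; if it is zero, the configuration is aborted after the first fold.
print(expected_num_folds, expected_total_repeats)
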
+ """ + evaluator = EnsembleOptimisationEvaluator( + backend=backend, + queue=queue, + metric=metric, + configuration=config, + seed=seed, + num_run=num_run, + output_y_hat_optimization=output_y_hat_optimization, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates, + use_ensemble_opt_loss=use_ensemble_opt_loss, + cur_stacking_layer=cur_stacking_layer + ) + evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/evaluation/stacking_evaluator.py b/autoPyTorch/evaluation/repeated_crossval_evaluator.py similarity index 62% rename from autoPyTorch/evaluation/stacking_evaluator.py rename to autoPyTorch/evaluation/repeated_crossval_evaluator.py index 4207e234f..c71be49a9 100644 --- a/autoPyTorch/evaluation/stacking_evaluator.py +++ b/autoPyTorch/evaluation/repeated_crossval_evaluator.py @@ -1,6 +1,10 @@ +from math import floor from multiprocessing.queues import Queue +from optparse import Option import os +import re import time +from timeit import repeat from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration @@ -8,6 +12,7 @@ import numpy as np from sklearn.base import BaseEstimator +from sklearn.ensemble import VotingClassifier from smac.tae import StatusType @@ -16,17 +21,17 @@ CLASSIFICATION_TASKS, MULTICLASSMULTIOUTPUT, ) -from autoPyTorch.ensemble.stacking_ensemble_builder import calculate_nomalised_margin_loss +from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes, RepeatedCrossValTypes from autoPyTorch.evaluation.abstract_evaluator import ( AbstractEvaluator, fit_and_suppress_warnings ) -from autoPyTorch.ensemble.stacking_ensemble import StackingEnsemble +from autoPyTorch.evaluation.utils import VotingRegressorWrapper, check_pipeline_is_fitted from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric -from autoPyTorch.utils.common import dict_repr, subsampler +from autoPyTorch.utils.common import subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates -__all__ = ['StackingEvaluator', 'eval_function'] +__all__ = ['RepeatedCrossValEvaluator', 'eval_repeated_cv_function'] def _get_y_array(y: np.ndarray, task_type: int) -> np.ndarray: @@ -37,7 +42,7 @@ def _get_y_array(y: np.ndarray, task_type: int) -> np.ndarray: return y -class StackingEvaluator(AbstractEvaluator): +class RepeatedCrossValEvaluator(AbstractEvaluator): """ This class builds a pipeline using the provided configuration. 
A pipeline implementing the provided configuration is fitted @@ -116,7 +121,9 @@ def __init__(self, backend: Backend, queue: Queue, logger_port: Optional[int] = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - use_ensemble_opt_loss=False) -> None: + use_ensemble_opt_loss=False, + cur_stacking_layer: int = 0 + ) -> None: super().__init__( backend=backend, queue=queue, @@ -135,20 +142,23 @@ def __init__(self, backend: Backend, queue: Queue, all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, search_space_updates=search_space_updates, - use_ensemble_opt_loss=use_ensemble_opt_loss + use_ensemble_opt_loss=use_ensemble_opt_loss, ) - self.logger.debug("use_ensemble_loss :{}".format(self.use_ensemble_opt_loss)) + self.num_repeats = len(self.splits) + self.num_folds = len(self.splits[0]) + + self.logger.debug(f"for num run: {num_run}, X_train.shape: {self.X_train.shape} and X_test.shape: {self.X_test.shape}") def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], valid_pred: Optional[np.ndarray], test_pred: Optional[np.ndarray], - pipeline_opt_pred: np.ndarray, - ensemble_opt_pred: np.ndarray, + opt_pred: np.ndarray, additional_run_info: Optional[Dict], - file_output: bool, status: StatusType, + file_output: bool, status: StatusType, ) -> Optional[Tuple[float, float, int, Dict]]: """This function does everything necessary after the fitting is done: + * predicting * saving the necessary files We use it as the signal handler so we can recycle the code for the @@ -158,7 +168,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], if file_output: loss_, additional_run_info_ = self.file_output( - pipeline_opt_pred, valid_pred, test_pred + opt_pred, valid_pred, test_pred ) else: loss_ = None @@ -169,13 +179,12 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], ) pipeline_loss, _ = self.calculate_auxiliary_losses( - pipeline_opt_pred, None + opt_pred, None ) - if loss_ is not None: return self.duration, loss_, self.seed, additional_run_info_ - cost = loss["ensemble_opt_loss"] if self.use_ensemble_opt_loss else loss[self.metric.name] + cost = loss[self.metric.name] additional_run_info = ( {} if additional_run_info is None else additional_run_info @@ -184,16 +193,18 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], additional_run_info[metric_name] = value additional_run_info['duration'] = self.duration additional_run_info['num_run'] = self.num_run - if pipeline_loss is not None: - additional_run_info['pipeline_loss'] = pipeline_loss if train_loss is not None: additional_run_info['train_loss'] = train_loss if validation_loss is not None: additional_run_info['validation_loss'] = validation_loss if test_loss is not None: additional_run_info['test_loss'] = test_loss - + if pipeline_loss is not None: + additional_run_info['pipeline_loss'] = pipeline_loss additional_run_info['opt_loss'] = loss + additional_run_info['configuration'] = self.configuration if not isinstance(self.configuration, Configuration) else self.configuration.get_dictionary() + additional_run_info['budget'] = self.budget + rval_dict = {'loss': cost, 'additional_run_info': additional_run_info, 'status': status} @@ -201,6 +212,48 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], self.queue.put(rval_dict) return None + def get_sorted_preds(self, preds: List[List[np.ndarray]], repeat_id: int) -> np.ndarray: + predictions = 
np.concatenate([pred for pred in preds if pred is not None]) + indices = np.concatenate([test_indices for _, test_indices in self.splits[repeat_id]]) + zipped_lists = zip(indices, predictions) + + sorted_zipped_lists = sorted(zipped_lists) + predictions = [pred for _, pred in sorted_zipped_lists] + return predictions + + def get_sorted_train_preds(self, preds: List[List[np.ndarray]], repeat_id: int): + predictions = np.concatenate([pred for pred in preds if pred is not None]) + indices = np.concatenate([train_indices for train_indices, _ in self.splits[repeat_id]]) + + unique_indices = set(indices) + sorted_predictions = np.zeros((len(unique_indices), self.num_classes)) + + for i in unique_indices: + positions = np.where(indices == i) + tmp = list() + for position in positions: + tmp.append(predictions[position]) + mean_tmp = np.squeeze(np.mean(tmp, axis=1)) + for j, mean in enumerate(mean_tmp): + sorted_predictions[i][j] = mean + return sorted_predictions + + def get_sorted_train_targets(self, preds: List[List[np.ndarray]], repeat_id: int): + predictions = np.concatenate([pred for pred in preds if pred is not None]) + indices = np.concatenate([train_indices for train_indices, _ in self.splits[repeat_id]]) + + unique_indices = set(indices) + sorted_predictions = np.zeros(len(unique_indices)) + + for i in unique_indices: + positions = np.where(indices == i) + tmp = list() + for position in positions: + tmp.append(predictions[position]) + mean_tmp = np.squeeze(np.mean(tmp, axis=1)) + sorted_predictions[i] = mean_tmp + return sorted_predictions + def file_output( self, Y_optimization_pred: np.ndarray, @@ -225,7 +278,7 @@ def file_output( # Y_train_pred deleted here. Fix unittest accordingly. [Y_optimization_pred, 'optimization'], [Y_valid_pred, 'validation'], - [Y_test_pred, 'test'] + [Y_test_pred, 'test'], ]: if y is not None and not np.all(np.isfinite(y)): return ( @@ -248,13 +301,30 @@ def file_output( if self.output_y_hat_optimization: self.backend.save_targets_ensemble(self.Y_optimization) - if hasattr(self, 'pipeline') and self.pipeline is not None: + if hasattr(self, 'pipelines') and self.pipelines is not None and isinstance(self.resampling_strategy, RepeatedCrossValTypes): + if self.pipelines[0] is not None and len(self.pipelines) > 0: + if 'pipelines' not in self.disabled_file_outputs: + if self.task_type in CLASSIFICATION_TASKS: + pipelines = VotingClassifier(estimators=None, voting='soft', ) + else: + pipelines = VotingRegressorWrapper(estimators=None) + pipelines.estimators_ = [pipeline for repeat_pipelines in self.pipelines for pipeline in repeat_pipelines if check_pipeline_is_fitted(pipeline, self.configuration)] + else: + pipelines = None + else: + pipelines = None + else: + pipelines = None + + if hasattr(self, 'pipeline') and self.pipeline is not None and isinstance(self.resampling_strategy, HoldoutValTypes): if 'pipeline' not in self.disabled_file_outputs: pipeline = self.pipeline else: pipeline = None else: - pipeline = None + # need a pipeline to get representation of the model. 
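file_output above bundles the fitted fold pipelines into a single soft VotingClassifier by assigning the already-fitted estimators directly instead of calling fit on the voter. A sketch of that pattern with plain scikit-learn models; whether an unfitted voter with manually populated estimators_ exposes predict_proba can depend on the scikit-learn version:

import numpy as np

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X, y = rng.rand(40, 3), rng.randint(0, 2, 40)

# Stand-ins for the per-fold pipelines that were already fitted during the repeats.
fold_models = [LogisticRegression().fit(X, y) for _ in range(3)]

voter = VotingClassifier(estimators=None, voting='soft')
voter.estimators_ = fold_models        # reuse the fitted models instead of refitting
print(voter.predict_proba(X[:2]))      # averaged class probabilities over the folds
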
+ # see https://github.com/automl/Auto-PyTorch/blob/master/autoPyTorch/api/base_task.py#L467 + pipeline = self.pipelines[-1][-1] self.logger.debug("Saving model {}_{}_{} to disk".format(self.seed, self.num_run, self.budget)) self.backend.save_numrun_to_dir( @@ -262,7 +332,7 @@ def file_output( idx=int(self.num_run), budget=float(self.budget), model=pipeline, - cv_model=None, + cv_model=pipelines, ensemble_predictions=( Y_optimization_pred if 'y_optimization' not in self.disabled_file_outputs else None @@ -284,57 +354,124 @@ def fit_predict_and_loss(self) -> None: holdout""" assert self.splits is not None, "Can't fit pipeline in {} is datamanager.splits is None" \ .format(self.__class__.__name__) - additional_run_info: Optional[Dict] = None - split_id = 0 - self.logger.info("Starting fit {}".format(split_id)) - pipeline = self._get_pipeline() + Y_train_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + Y_optimization_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + Y_valid_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + Y_test_pred: List[List[Optional[np.ndarray]]] = [None] * self.num_repeats + # Y_train_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + # Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + + + self.pipelines = [[self._get_pipeline() for _ in range(self.num_folds)] for _ in range(self.num_repeats)] + + additional_run_info = {} + + total_repeats = self.num_repeats + for repeat_id, folds in enumerate(self.splits): + if repeat_id >= total_repeats: + break + + y_train_pred_folds = [None] * self.num_folds + y_optimization_pred_folds = [None] * self.num_folds + y_valid_pred_folds = [None] * self.num_folds + y_test_pred_folds = [None] * self.num_folds + # y_train_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + # y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + + for i, (train_split, test_split) in enumerate(folds): + starttime = time.time() + self.logger.info(f"Starting fit for repeat: {repeat_id} and fold: {i}") + pipeline = self.pipelines[repeat_id][i] + ( + y_train_pred, + y_opt_pred, + y_valid_pred, + y_test_pred, + ) = self._fit_and_predict(pipeline, i, repeat_id, + train_indices=train_split, + test_indices=test_split) + y_train_pred_folds[i] = y_train_pred + y_optimization_pred_folds[i] = y_opt_pred + if y_valid_pred is not None: + y_valid_pred_folds[i] = y_valid_pred + if y_test_pred is not None: + y_test_pred_folds[i] = y_test_pred + + # y_train_targets[i] = self.y_train[train_split] + # y_targets[i] = self.y_train[test_split] + + additional_run_info.update(pipeline.get_additional_run_info() if hasattr( + pipeline, 'get_additional_run_info') and pipeline.get_additional_run_info() is not None else {}) + duration_fit_single = time.time() - starttime + if repeat_id == 0 and i == 0: + expected_num_folds = floor(self.cutoff/(1.15*duration_fit_single)) + self.logger.debug(f"cutoff :{self.cutoff}, expected num folds: {expected_num_folds}, duration_fit_single: {duration_fit_single}") + expected_total_repeats = floor(expected_num_folds/self.num_folds) + if expected_total_repeats < total_repeats: + self.logger.debug(f"For num_run: {self.num_run}, expected repeats of cross validation: {expected_total_repeats} " + f"is less than the given value: {total_repeats}. 
Will only run for {expected_total_repeats}") + total_repeats = expected_total_repeats + if total_repeats <= repeat_id: + raise ValueError("Not expected to complete first repeat, terminating configuration") + + Y_train_pred[repeat_id] = self.get_sorted_train_preds(y_train_pred_folds, repeat_id) + Y_optimization_pred[repeat_id] = self.get_sorted_preds(y_optimization_pred_folds, repeat_id) + if self.X_valid is not None: + Y_valid_pred[repeat_id] = np.array([y_valid_pred_folds[i] for i in range(self.num_folds) if y_valid_pred_folds[i] is not None]) + # Average the predictions of several pipelines + if len(Y_valid_pred[repeat_id].shape) == 3: + Y_valid_pred[repeat_id] = np.nanmean(Y_valid_pred[repeat_id], axis=0) + else: + Y_valid_pred = None - train_split, test_split = self.splits[split_id] - self.Y_optimization = self.y_train[test_split] - self.Y_actual_train = self.y_train[train_split] - ( - y_train_pred, - y_pipeline_opt_pred, - y_ensemble_opt_pred, - y_valid_pred, - y_test_pred, - y_ensemble_preds - ) = self._fit_and_predict(pipeline, split_id, - train_indices=train_split, - test_indices=test_split) - - train_loss = self._loss(self.y_train[train_split], y_train_pred) - loss = self._loss(self.y_train[test_split], y_ensemble_opt_pred) - - loss['ensemble_opt_loss'] = calculate_nomalised_margin_loss(y_ensemble_preds, self.y_train[test_split], self.task_type) - additional_run_info = pipeline.get_additional_run_info() if hasattr( - pipeline, 'get_additional_run_info') else {} + if self.X_test is not None: + Y_test_pred[repeat_id] = np.array([y_test_pred_folds[i] for i in range(self.num_folds) if y_test_pred_folds[i] is not None]) + # Average the predictions of several pipelines of the folds + if len(Y_test_pred[repeat_id].shape) == 3: + Y_test_pred[repeat_id] = np.nanmean(Y_test_pred[repeat_id], axis=0) + else: + Y_test_pred = None - status = StatusType.SUCCESS + # # as targets do change within repeats + # Y_targets = self.y_train.copy() # self.get_sorted_preds(y_targets, -1) + # Y_train_targets = self.y_train.copy() # self.get_sorted_train_targets(y_train_targets, -1) + + # Average prediction values accross repeats + Y_train_pred = np.nanmean(Y_train_pred[:total_repeats], axis=0) + Y_optimization_pred = np.nanmean(Y_optimization_pred[:total_repeats], axis=0) + Y_valid_pred = np.nanmean(Y_valid_pred[:total_repeats], axis=0) if Y_valid_pred is not None else None + Y_test_pred = np.nanmean(Y_test_pred[:total_repeats], axis=0) if Y_test_pred is not None else None + + self.Y_optimization = self.y_train # np.array(Y_targets) + self.Y_actual_train = self.y_train # np.array(Y_train_targets) + + self.pipeline = self._get_pipeline() - self.logger.debug("In train evaluator.fit_predict_and_loss, num_run: {} loss:{}," - " status: {},\nadditional run info:\n{}".format(self.num_run, - loss, - dict_repr(additional_run_info), - status)) + train_loss = self._loss(self.Y_actual_train, Y_train_pred) + opt_loss = self._loss(self.Y_optimization, Y_optimization_pred) + + status = StatusType.SUCCESS + self.logger.debug("In train evaluator fit_predict_and_loss, num_run: {} loss:{}".format( + self.num_run, + opt_loss + )) self.finish_up( - loss=loss, + loss=opt_loss, train_loss=train_loss, - ensemble_opt_pred=y_ensemble_opt_pred, - valid_pred=y_valid_pred, - test_pred=y_test_pred, + opt_pred=Y_optimization_pred, + valid_pred=Y_valid_pred, + test_pred=Y_test_pred, additional_run_info=additional_run_info, file_output=True, status=status, - pipeline_opt_pred=y_pipeline_opt_pred ) - def _fit_and_predict( self, pipeline: 
BaseEstimator, fold: int, + repeat_id: int, train_indices: Union[np.ndarray, List], test_indices: Union[np.ndarray, List], ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: @@ -344,12 +481,15 @@ def _fit_and_predict( X = {'train_indices': train_indices, 'val_indices': test_indices, 'split_id': fold, + 'repeat_id': repeat_id, 'num_run': self.num_run, **self.fit_dictionary} # fit dictionary y = None fit_and_suppress_warnings(self.logger, pipeline, X, y) self.logger.info("Model fitted, now predicting") - Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred, Y_ensemble_preds = self._predict( + ( + Y_train_pred, Y_opt_pred, Y_valid_pred, Y_test_pred + ) = self._predict( pipeline, train_indices=train_indices, test_indices=test_indices, @@ -357,7 +497,7 @@ def _fit_and_predict( self.pipeline = pipeline - return Y_train_pred, Y_pipeline_opt_pred, Y_ensemble_opt_pred, Y_valid_pred, Y_test_pred, Y_ensemble_preds + return Y_train_pred, Y_opt_pred, Y_valid_pred, Y_test_pred def _predict( self, @@ -368,19 +508,9 @@ def _predict( train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, self.y_train[train_indices]) - pipeline_opt_pred = self.predict_function(subsampler(self.X_train, test_indices), pipeline, + opt_pred = self.predict_function(subsampler(self.X_train, test_indices), pipeline, self.y_train[train_indices]) - ensemble_dir = self.backend.get_ensemble_dir() - if os.path.exists(ensemble_dir) and len(os.listdir(ensemble_dir)) >= 1: - old_ensemble = self.backend.load_ensemble(self.seed) - assert isinstance(old_ensemble, StackingEnsemble) - ensemble_opt_pred = old_ensemble.predict_with_current_pipeline(pipeline_opt_pred) - ensemble_preds = old_ensemble.get_ensemble_predictions_with_current_pipeline(pipeline_opt_pred) - else: - ensemble_opt_pred = pipeline_opt_pred.copy() - ensemble_preds = [pipeline_opt_pred] - # self.logger.debug(f"for model {self.seed}_{self.num_run}_{self.budget} ensemble_predictions are {ensemble_opt_pred}") if self.X_valid is not None: valid_pred = self.predict_function(self.X_valid, pipeline, @@ -394,11 +524,11 @@ def _predict( else: test_pred = None - return train_pred, pipeline_opt_pred, ensemble_opt_pred, valid_pred, test_pred, ensemble_preds + return train_pred, opt_pred, valid_pred, test_pred # create closure for evaluating an algorithm -def eval_function( +def eval_repeated_cv_function( backend: Backend, queue: Queue, metric: autoPyTorchMetric, @@ -418,6 +548,7 @@ def eval_function( search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, use_ensemble_opt_loss=False, instance: str = None, + cur_stacking_layer: int = 0, ) -> None: """ This closure allows the communication between the ExecuteTaFuncWithQueue and the @@ -481,7 +612,7 @@ def eval_function( This instance is a compatibility argument for SMAC, that is capable of working with multiple datasets at the same time. 
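+        cur_stacking_layer: int
+            Index of the stacking layer whose configurations are currently being
+            evaluated; it is forwarded unchanged to the evaluator.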
""" - evaluator = StackingEvaluator( + evaluator = RepeatedCrossValEvaluator( backend=backend, queue=queue, metric=metric, @@ -499,6 +630,7 @@ def eval_function( all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, search_space_updates=search_space_updates, - use_ensemble_opt_loss=use_ensemble_opt_loss + use_ensemble_opt_loss=use_ensemble_opt_loss, + cur_stacking_layer=cur_stacking_layer, ) evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 4ac84c8ef..fe8513aec 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -28,9 +28,11 @@ from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, - NoResamplingStrategyTypes + NoResamplingStrategyTypes, + RepeatedCrossValTypes ) -import autoPyTorch.evaluation.stacking_evaluator +from autoPyTorch.evaluation.ensemble_optimisation_evaluator import eval_ensemble_optimise_function +from autoPyTorch.evaluation.repeated_crossval_evaluator import eval_repeated_cv_function from autoPyTorch.evaluation.test_evaluator import eval_test_function from autoPyTorch.evaluation.train_evaluator import eval_train_function from autoPyTorch.evaluation.utils import ( @@ -131,8 +133,9 @@ def __init__( logger_port: int = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - ensemble_method = None, - use_ensemble_opt_loss=False + ensemble_method: EnsembleSelectionTypes = None, + use_ensemble_opt_loss=False, + cur_stacking_layer: int = 0 ): self.backend = backend @@ -151,10 +154,25 @@ def __init__( self.resampling_strategy_args = dm.resampling_strategy_args if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): - if ensemble_method is None or ensemble_method == EnsembleSelectionTypes.ensemble_selection: - eval_function = eval_train_function - elif ensemble_method == EnsembleSelectionTypes.stacking_ensemble: - eval_function = autoPyTorch.evaluation.stacking_evaluator.eval_function + eval_function = eval_train_function + if ( + ensemble_method == EnsembleSelectionTypes.stacking_optimisation_ensemble + or ensemble_method == EnsembleSelectionTypes.stacking_repeat_models + or ensemble_method == EnsembleSelectionTypes.stacking_autogluon + or ensemble_method == EnsembleSelectionTypes.stacking_ensemble_selection_per_layer + ): + raise ValueError(f"fitting ensemble stacking requires resampling strategy to be of {RepeatedCrossValTypes} but got {self.resampling_strategy}") + elif isinstance(self.resampling_strategy, RepeatedCrossValTypes): + if ensemble_method == EnsembleSelectionTypes.stacking_optimisation_ensemble: + eval_function = eval_ensemble_optimise_function + elif ( + ensemble_method == EnsembleSelectionTypes.stacking_ensemble_selection_per_layer + or ensemble_method == EnsembleSelectionTypes.stacking_repeat_models + or ensemble_method == EnsembleSelectionTypes.stacking_autogluon + or ensemble_method is None + or ensemble_method == EnsembleSelectionTypes.ensemble_selection + ): + eval_function = eval_repeated_cv_function self.output_y_hat_optimization = output_y_hat_optimization elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): eval_function = eval_test_function @@ -209,6 +227,7 @@ def __init__( self.memory_limit = memory_limit self.search_space_updates = search_space_updates + self.cur_stacking_layer = cur_stacking_layer self.use_ensemble_opt_loss = use_ensemble_opt_loss def _check_and_get_default_budget(self) -> float: @@ -349,7 
+368,8 @@ def run( logger_port=self.logger_port, all_supported_metrics=self.all_supported_metrics, search_space_updates=self.search_space_updates, - use_ensemble_opt_loss=self.use_ensemble_opt_loss + use_ensemble_opt_loss=self.use_ensemble_opt_loss, + cur_stacking_layer=self.cur_stacking_layer ) info: Optional[List[RunValue]] @@ -511,3 +531,4 @@ def run( ) ) return status, cost, runtime, additional_run_info + diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index e5884a9f7..891fdeb46 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -132,7 +132,8 @@ def __init__(self, backend: Backend, queue: Queue, keep_models: Optional[bool] = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - use_ensemble_opt_loss=False) -> None: + use_ensemble_opt_loss=False, + cur_stacking_layer: int = 0) -> None: super().__init__( backend=backend, queue=queue, @@ -160,7 +161,7 @@ def __init__(self, backend: Backend, queue: Queue, f'(CrossValTypes, HoldoutValTypes), but got {self.resampling_strategy}' ) - self.num_folds: int = len(self.splits) + self.num_folds: int = len(self.splits[0]) self.Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds self.Y_train_targets: np.ndarray = np.ones(self.y_train.shape) * np.NaN self.pipelines: List[Optional[BaseEstimator]] = [None] * self.num_folds @@ -177,14 +178,15 @@ def fit_predict_and_loss(self) -> None: additional_run_info: Optional[Dict] = None if self.num_folds == 1: split_id = 0 + repeat_id = 0 self.logger.info("Starting fit {}".format(split_id)) pipeline = self._get_pipeline() - train_split, test_split = self.splits[split_id] + train_split, test_split = self.splits[repeat_id][split_id] self.Y_optimization = self.y_train[test_split] self.Y_actual_train = self.y_train[train_split] - y_train_pred, y_opt_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, + y_train_pred, y_opt_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, repeat_id, train_indices=train_split, test_indices=test_split, add_pipeline_to_self=True) @@ -231,11 +233,12 @@ def fit_predict_and_loss(self) -> None: opt_fold_weights = [np.NaN] * self.num_folds additional_run_info = {} + repeat_id = 0 - for i, (train_split, test_split) in enumerate(self.splits): + for i, (train_split, test_split) in enumerate(self.splits[repeat_id]): pipeline = self.pipelines[i] - train_pred, opt_pred, valid_pred, test_pred = self._fit_and_predict(pipeline, i, + train_pred, opt_pred, valid_pred, test_pred = self._fit_and_predict(pipeline, i, repeat_id, train_indices=train_split, test_indices=test_split, add_pipeline_to_self=False) @@ -350,7 +353,9 @@ def fit_predict_and_loss(self) -> None: status=status, ) - def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Union[np.ndarray, List], + def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, + repeat_id: int, + train_indices: Union[np.ndarray, List], test_indices: Union[np.ndarray, List], add_pipeline_to_self: bool ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: @@ -362,6 +367,7 @@ def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Un X = {'train_indices': train_indices, 'val_indices': test_indices, 'split_id': fold, + 'repeat_id': repeat_id, 'num_run': self.num_run, **self.fit_dictionary} # fit dictionary y = None @@ -431,6 +437,7 @@ def eval_train_function( 
all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, use_ensemble_opt_loss=False, + cur_stacking_layer: int = 0, instance: str = None, ) -> None: """ @@ -513,6 +520,7 @@ def eval_train_function( all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, search_space_updates=search_space_updates, - use_ensemble_opt_loss=use_ensemble_opt_loss + use_ensemble_opt_loss=use_ensemble_opt_loss, + cur_stacking_layer=cur_stacking_layer ) evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/evaluation/utils.py b/autoPyTorch/evaluation/utils.py index 37e5fa36d..094e373ac 100644 --- a/autoPyTorch/evaluation/utils.py +++ b/autoPyTorch/evaluation/utils.py @@ -2,6 +2,8 @@ from multiprocessing.queues import Queue from typing import List, Optional, Union +from ConfigSpace.configuration_space import Configuration + import numpy as np from sklearn.ensemble import VotingRegressor @@ -20,6 +22,13 @@ ] +def check_pipeline_is_fitted(pipeline, configuration): + if isinstance(configuration, Configuration): + return hasattr(pipeline.named_steps['network'], 'is_fitted_') and pipeline.named_steps['network'].is_fitted_ + else: + return pipeline.is_fitted_ + + def read_queue(queue_: Queue) -> List[RunValue]: stack: List[RunValue] = [] while True: diff --git a/autoPyTorch/optimizer/run_history_callback.py b/autoPyTorch/optimizer/run_history_callback.py index 376478813..1ee56666e 100644 --- a/autoPyTorch/optimizer/run_history_callback.py +++ b/autoPyTorch/optimizer/run_history_callback.py @@ -244,6 +244,7 @@ def run(self, iteration: int) -> Optional[List[Tuple[RunKey, float]]]: try: with (open(self.ensemble_loss_file, "rb")) as memory: read_losses = pickle.load(memory) + self.logger.debug(f"read losses at iteration: {iteration}: {read_losses.keys()}") except Exception as e: self.logger.debug(f"Could not read losses at iteration: {iteration} with exception {e}") return None diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 47fb4e619..5cad8c10c 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -2,34 +2,44 @@ import json import logging.handlers from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import os import ConfigSpace from ConfigSpace.configuration_space import Configuration import dask.distributed +import numpy as np + from smac.facade.smac_ac_facade import SMAC4AC from smac.intensification.hyperband import Hyperband from smac.optimizer.smbo import SMBO -from smac.runhistory.runhistory import RunHistory +from smac.runhistory.runhistory import RunHistory, DataOrigin from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost from smac.scenario.scenario import Scenario from smac.tae.dask_runner import DaskParallelRunner from smac.tae.serial_runner import SerialRunner from smac.utils.io.traj_logging import TrajEntry +from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.datasets.base_dataset import BaseDataset +from autoPyTorch.datasets.tabular_dataset import TabularDataset from autoPyTorch.datasets.resampling_strategy import ( - CrossValTypes, + ResamplingStrategies, DEFAULT_RESAMPLING_PARAMETERS, HoldoutValTypes, - NoResamplingStrategyTypes + CrossValTypes ) +from autoPyTorch.datasets.utils import get_appended_dataset from autoPyTorch.ensemble.ensemble_builder_manager import EnsembleBuilderManager +from 
autoPyTorch.ensemble.ensemble_optimisation_stacking_ensemble import EnsembleOptimisationStackingEnsemble +from autoPyTorch.ensemble.ensemble_selection_per_layer_stacking_ensemble import EnsembleSelectionPerLayerStackingEnsemble from autoPyTorch.ensemble.utils import EnsembleSelectionTypes from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash -from autoPyTorch.optimizer.utils import read_return_initial_configurations +from autoPyTorch.optimizer.utils import delete_other_runs, read_return_initial_configurations from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.utils.pipeline import get_configuration_space, get_dataset_requirements from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.logging_ import get_named_client_logger from autoPyTorch.utils.stopwatch import StopWatch @@ -102,9 +112,7 @@ def __init__(self, pipeline_config: Dict[str, Any], start_num_run: int = 1, seed: int = 1, - resampling_strategy: Union[HoldoutValTypes, - CrossValTypes, - NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, @@ -113,13 +121,14 @@ def __init__(self, get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, ensemble_callback: Optional[EnsembleBuilderManager] = None, + num_stacking_layers: Optional[int] = None, logger_port: Optional[int] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, portfolio_selection: Optional[str] = None, pynisher_context: str = 'spawn', min_budget: int = 5, max_budget: int = 50, - ensemble_method: int = EnsembleSelectionTypes.ensemble_selection, + ensemble_method: EnsembleSelectionTypes = EnsembleSelectionTypes.ensemble_selection, other_callbacks: Optional[List] = None, smbo_class: Optional[SMBO] = None, use_ensemble_opt_loss: bool = False @@ -200,6 +209,7 @@ def __init__(self, """ super(AutoMLSMBO, self).__init__() # data related + self.datamanager: Optional[BaseDataset] = None self.dataset_name = dataset_name self.metric = metric @@ -239,6 +249,13 @@ def __init__(self, self.ensemble_method = ensemble_method self.ensemble_callback = ensemble_callback + if self.ensemble_method.is_stacking_ensemble() and num_stacking_layers is None: + raise ValueError("'num_stacking_layers' can't be none for stacked ensembles") + + self.num_stacking_layers = num_stacking_layers + + self.run_history = RunHistory() + self.trajectory: List[TrajEntry] = [] self.other_callbacks = other_callbacks self.smbo_class = smbo_class @@ -265,18 +282,46 @@ def __init__(self, self.logger.warning("None of the portfolio configurations are compatible" " with the current search space. 
Skipping initial configuration...") - def run_smbo(self, func: Optional[Callable] = None - ) -> Tuple[RunHistory, List[TrajEntry], str]: - - self.watcher.start_task('SMBO') - self.logger.info("Started run of SMBO") + def reset_data_manager(self) -> None: + if self.datamanager is not None: + del self.datamanager + self.datamanager = self.backend.load_datamanager() + if self.datamanager is not None and self.datamanager.task_type is not None: + self.task = self.datamanager.task_type + + def reset_attributes(self, datamanager: BaseDataset) -> None: + self.backend.save_datamanager(datamanager=datamanager) + + dataset_requirements = get_dataset_requirements( + info=datamanager.get_required_dataset_info(), + include=self.include, + exclude=self.exclude, + search_space_updates=self.search_space_updates) + self._dataset_requirements = dataset_requirements + dataset_properties = datamanager.get_dataset_properties(dataset_requirements) + self.config_space = get_configuration_space(dataset_properties, include=self.include, exclude=self.exclude, search_space_updates=self.search_space_updates) + + def _run_smbo( + self, + cur_stacking_layer: int, + walltime_limit: int, + initial_num_run: int, + func: Optional[Callable] = None, + ) -> Tuple[RunHistory, List[TrajEntry], str]: + + current_task_name = f'SMBO_{cur_stacking_layer}' + + self.watcher.start_task(current_task_name) + self.logger.info(f"Started {cur_stacking_layer} run of SMBO") + + # # == first things first: load the datamanager + # self.reset_data_manager() # == Initialize non-SMBO stuff # first create a scenario seed = self.seed self.config_space.seed(seed) # allocate a run history - num_run = self.start_num_run # Initialize some SMAC dependencies @@ -295,7 +340,7 @@ def run_smbo(self, func: Optional[Callable] = None ta_kwargs = dict( backend=copy.deepcopy(self.backend), seed=seed, - initial_num_run=num_run, + initial_num_run=initial_num_run, include=self.include if self.include is not None else dict(), exclude=self.exclude if self.exclude is not None else dict(), metric=self.metric, @@ -308,13 +353,14 @@ def run_smbo(self, func: Optional[Callable] = None search_space_updates=self.search_space_updates, pynisher_context=self.pynisher_context, ensemble_method=self.ensemble_method, - use_ensemble_opt_loss=self.use_ensemble_opt_loss + use_ensemble_opt_loss=self.use_ensemble_opt_loss, + cur_stacking_layer=cur_stacking_layer ) ta = ExecuteTaFuncWithQueue self.logger.info("Finish creating Target Algorithm (TA) function") - startup_time = self.watcher.wall_elapsed(self.dataset_name) - total_walltime_limit = self.total_walltime_limit - startup_time - 5 + startup_time = self.watcher.wall_elapsed(current_task_name) + walltime_limit = walltime_limit - startup_time - 5 scenario_dict = { 'abort_on_first_run_crash': False, 'cs': self.config_space, @@ -324,7 +370,7 @@ def run_smbo(self, func: Optional[Callable] = None 'memory_limit': self.memory_limit, 'output-dir': self.backend.get_smac_output_directory(), 'run_obj': 'quality', - 'wallclock_limit': total_walltime_limit, + 'wallclock_limit': walltime_limit, 'cost_for_crash': self.worst_possible_result, } if self.smac_scenario_args is not None: @@ -365,7 +411,8 @@ def run_smbo(self, func: Optional[Callable] = None initial_budget=self.min_budget, max_budget=self.max_budget, dask_client=self.dask_client, - initial_configurations=self.initial_configurations) + initial_configurations=self.initial_configurations, + smbo_class=self.smbo_class) else: smac = get_smac_object(scenario_dict=scenario_dict, seed=seed, @@ 
-378,20 +425,22 @@ def run_smbo(self, func: Optional[Callable] = None initial_configurations=self.initial_configurations, smbo_class=self.smbo_class) + if self.ensemble_method.is_stacking_ensemble(): + self.ensemble_callback.update_for_new_stacking_layer(cur_stacking_layer, initial_num_run) if self.ensemble_callback is not None: smac.register_callback(self.ensemble_callback) - if self.other_callbacks is not None: for callback in self.other_callbacks: smac.register_callback(callback) + self.logger.info("initialised SMBO, running SMBO.optimize()") smac.optimize() self.logger.info("finished SMBO.optimize()") - self.runhistory = smac.solver.runhistory - self.trajectory = smac.solver.intensifier.traj_logger.trajectory + runhistory = smac.solver.runhistory + trajectory = smac.solver.intensifier.traj_logger.trajectory if isinstance(smac.solver.tae_runner, DaskParallelRunner): self._budget_type = smac.solver.tae_runner.single_worker.budget_type elif isinstance(smac.solver.tae_runner, SerialRunner): @@ -399,4 +448,55 @@ def run_smbo(self, func: Optional[Callable] = None else: raise NotImplementedError(type(smac.solver.tae_runner)) - return self.runhistory, self.trajectory, self._budget_type + return runhistory, trajectory, self._budget_type + + def run_smbo(self, func: Optional[Callable] = None + ) -> Tuple[RunHistory, List[TrajEntry], str]: + individual_wall_times = self.total_walltime_limit / self.num_stacking_layers + initial_num_run = self.start_num_run + self.reset_data_manager() + for cur_stacking_layer in range(self.num_stacking_layers): + if cur_stacking_layer == 0: + self.logger.debug(f"Initial feat_types = {self.datamanager.feat_type}") + run_history, trajectory, _ = self._run_smbo( + walltime_limit=individual_wall_times, + cur_stacking_layer=cur_stacking_layer, + initial_num_run=initial_num_run, + func=func + ) + self.run_history.update(run_history, origin=DataOrigin.INTERNAL) + self.trajectory.extend(trajectory) + if self.num_stacking_layers <= 1: + break + old_ensemble: Optional[Union[EnsembleSelectionPerLayerStackingEnsemble, EnsembleOptimisationStackingEnsemble]] = None + ensemble_dir = self.backend.get_ensemble_dir() + if os.path.exists(ensemble_dir) and len(os.listdir(ensemble_dir)) >= 1: + old_ensemble = self.backend.load_ensemble(self.seed) + assert isinstance(old_ensemble, (EnsembleOptimisationStackingEnsemble, EnsembleSelectionPerLayerStackingEnsemble)) + if cur_stacking_layer != self.num_stacking_layers -1: + selected_identifiers = old_ensemble.get_selected_model_identifiers()[old_ensemble.cur_stacking_layer] + nonnull_identifiers = [identifier for identifier in selected_identifiers if identifier is not None] + ensemble_runs = [self.backend.get_numrun_directory(seed=seed, num_run=num_run, budget=budget).split('/')[-1] for seed, num_run, budget in nonnull_identifiers] + self.logger.debug(f"deleting runs other than {ensemble_runs}") + delete_other_runs(ensemble_runs=ensemble_runs, runs_directory=self.backend.get_runs_directory()) + previous_layer_predictions_train = old_ensemble.get_layer_stacking_ensemble_predictions(stacking_layer=cur_stacking_layer) + previous_layer_predictions_test = old_ensemble.get_layer_stacking_ensemble_predictions(stacking_layer=cur_stacking_layer, dataset='test') + self.logger.debug(f"Original feat types len: {len(self.datamanager.feat_type)}") + nonnull_model_predictions_train = [pred for pred in previous_layer_predictions_train if pred is not None] + nonnull_model_predictions_test = [pred for pred in previous_layer_predictions_test if pred is not 
None] + assert len(nonnull_model_predictions_train) == len(nonnull_model_predictions_test) + self.logger.debug(f"length Non nulll predictions: {len(nonnull_model_predictions_train)}") + datamanager = get_appended_dataset( + original_dataset=self.datamanager, + previous_layer_predictions_train=nonnull_model_predictions_train, + previous_layer_predictions_test=nonnull_model_predictions_test, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + ) + self.logger.debug(f"new feat_types len: {len(datamanager.feat_type)}") + self.reset_attributes(datamanager=datamanager) + + initial_num_run = self.backend.get_next_num_run() + self.logger.debug(f"cutoff num_run: {initial_num_run}") + + return self.run_history, self.trajectory, self._budget_type diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index c44252021..23af08c0f 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -1,5 +1,6 @@ import json import os +import shutil import warnings from typing import Any, Dict, List, Union @@ -48,6 +49,14 @@ def read_return_initial_configurations( f"configuration as it does not match the current config space. ") return initial_configurations + +def delete_other_runs(ensemble_runs, runs_directory): + all_runs = os.listdir(runs_directory) + for run in all_runs: + if run not in ensemble_runs: + shutil.rmtree(os.path.join(runs_directory, run)) + + class AdjustRunHistoryCallback: """ Allows manipulating run history for custom needs @@ -55,6 +64,7 @@ class AdjustRunHistoryCallback: def __call__(self, smbo: 'SMBO') -> RunHistory: pass + class autoPyTorchSMBO(SMBO): def __init__(self, scenario: Scenario, @@ -135,7 +145,6 @@ def _incorporate_run_results(self, run_info: RunInfo, result: RunValue, time_lef "configuration does not crashes. (To deactivate this exception, use the SMAC scenario option " "'abort_on_first_run_crash'). 
Additional run info: %s" % result.additional_info ) - self.logger.debug(f"\nbefore ensemble, result: {result}, \nrunhistory: {self.runhistory.data}") for callback in self._callbacks['_incorporate_run_results']: response = callback(smbo=self, run_info=run_info, result=result, time_left=time_left) # If a callback returns False, the optimization loop should be interrupted diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index fe9727502..f87d36cf7 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -310,33 +310,6 @@ def _add_forbidden_conditions(self, cs): """ - # Learned Entity Embedding is only valid when encoder is one hot encoder - if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): - embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices - if 'LearnedEntityEmbedding' in embeddings: - encoders = cs.get_hyperparameter('encoder:__choice__').choices - possible_default_embeddings = copy(list(embeddings)) - del possible_default_embeddings[possible_default_embeddings.index('LearnedEntityEmbedding')] - - for encoder in encoders: - if encoder == 'OneHotEncoder': - continue - while True: - try: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), - ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), encoder) - )) - break - except ValueError: - # change the default and try again - try: - default = possible_default_embeddings.pop() - except IndexError: - raise ValueError("Cannot find a legal default configuration") - cs.get_hyperparameter('network_embedding:__choice__').default_value = default - # Disable CyclicLR until todo is completed. 
if 'lr_scheduler' in self.named_steps.keys() and 'trainer' in self.named_steps.keys(): trainers = cs.get_hyperparameter('trainer:__choice__').choices diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index 6b38b4650..bfb54610e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -23,7 +23,10 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N self.preprocessor: Optional[ColumnTransformer] = None self.add_fit_requirements([ FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), - FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)]) + FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), + FitRequirement('skew_columns', (List,), user_defined=True, dataset_property=False), + FitRequirement('encode_columns', (List,), user_defined=True, dataset_property=False), + FitRequirement('embed_columns', (List,), user_defined=True, dataset_property=False)]) def get_column_transformer(self) -> ColumnTransformer: """ @@ -63,6 +66,21 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": column_transformers.append( ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns']) ) + if len(preprocessors['skew']) > 0: + skew_pipeline = make_pipeline(*preprocessors['skew']) + column_transformers.append( + ('skew_pipeline', skew_pipeline, X['skew_columns']) + ) + if len(preprocessors['encode']) > 0: + encode_pipeline = make_pipeline(*preprocessors['encode']) + column_transformers.append( + ('encode_pipeline', encode_pipeline, X['encode_columns']) + ) + if len(preprocessors['scale']) > 0: + scale_pipeline = make_pipeline(*preprocessors['scale']) + column_transformers.append( + ('scale_pipeline', scale_pipeline, X['scale_columns']) + ) # in case the preprocessing steps are disabled # i.e, NoEncoder for categorical, we want to diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/base_tabular_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/base_tabular_preprocessing.py index aefe9ddf8..18d7f815e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/base_tabular_preprocessing.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/base_tabular_preprocessing.py @@ -14,7 +14,7 @@ class autoPyTorchTabularPreprocessingComponent(autoPyTorchPreprocessingComponent def __init__(self) -> None: super().__init__() self.preprocessor: Union[Dict[str, Optional[BaseEstimator]], BaseEstimator] = dict( - numerical=None, categorical=None) + numerical=None, encode=None, skew=None, scale=None, categorical=None) def get_preprocessor_dict(self) -> Dict[str, BaseEstimator]: """ @@ -26,9 +26,6 @@ def get_preprocessor_dict(self) -> Dict[str, BaseEstimator]: Returns: Dict[str, BaseEstimator]: early_preprocessor dictionary """ - if (self.preprocessor['numerical'] and self.preprocessor['categorical']) is None: - raise AttributeError("{} can't return early_preprocessor dict without fitting first" - .format(self.__class__.__name__)) return self.preprocessor def __str__(self) -> str: diff --git 
a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py
new file mode 100644
index 000000000..0333c3cab
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py
@@ -0,0 +1,100 @@
+from typing import Any, Dict, List, Optional, Union
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
+)
+
+import pandas as pd
+
+import numpy as np
+
+from scipy.stats import skew
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
+    autoPyTorchTabularPreprocessingComponent
+from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, ispandas
+
+
+def _get_skew(
+    data: Union[np.ndarray, pd.DataFrame]
+) -> float:
+    return data.skew() if ispandas(data) else skew(data)
+
+class ColumnSplitter(autoPyTorchTabularPreprocessingComponent):
+    """
+    Splits the dataset's columns into embed, encode, skew and scale groups, based on the number of categories of each categorical column and the skewness of each numerical column.
+    """
+    def __init__(
+        self,
+        min_categories_for_embedding: float = 5,
+        skew_threshold: float = 0.99,
+        random_state: Optional[np.random.RandomState] = None
+    ):
+        self.min_categories_for_embedding = min_categories_for_embedding
+        self.skew_threshold = skew_threshold
+
+        self.special_feature_types = dict(skew_columns=[], encode_columns=[], embed_columns=[], scale_columns=[])
+        self.num_categories_per_col: Optional[List] = None
+        super().__init__()
+
+    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'ColumnSplitter':
+
+        self.check_requirements(X, y)
+
+        if len(X['dataset_properties']['categorical_columns']) > 0:
+            self.num_categories_per_col = []
+            for categories_per_column, column in zip(X['dataset_properties']['num_categories_per_col'], X['dataset_properties']['categorical_columns']):
+                if (
+                    categories_per_column >= self.min_categories_for_embedding
+                ):
+                    self.special_feature_types['embed_columns'].append(column)
+                    self.num_categories_per_col.append(categories_per_column)
+                else:
+                    self.special_feature_types['encode_columns'].append(column)
+
+        # Route each numerical column to the skew or the scale group based on its skewness
+        for column in X['dataset_properties']['numerical_columns']:
+
+            if np.abs(_get_skew(X['X_train'][X['train_indices']][column])) > self.skew_threshold:
+                self.special_feature_types['skew_columns'].append(column)
+            else:
+                self.special_feature_types['scale_columns'].append(column)
+
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        if self.num_categories_per_col is not None:
+            X['dataset_properties']['num_categories_per_col'] = self.num_categories_per_col
+        X.update(self.special_feature_types)
+        return X
+
+    @staticmethod
+    def get_properties(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+    ) -> Dict[str, Union[str, bool]]:
+
+        return {
+            'shortname': 'ColumnSplitter',
+            'name': 'Column Splitter',
+            'handles_sparse': False,
+        }
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
+        min_categories_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace(
+            hyperparameter="min_categories_for_embedding",
+            value_range=(3, 4, 10, 100, 1000),
+            default_value=4),
+        skew_threshold: HyperparameterSearchSpace =
HyperparameterSearchSpace(hyperparameter="skew_threshold", + value_range=(0.2, 0.3, 0.5, 0.8, 0.99, 10.0, 100.0), + default_value=0.99,) + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + + add_hyperparameter(cs, min_categories_for_embedding, CategoricalHyperparameter) + add_hyperparameter(cs, skew_threshold, CategoricalHyperparameter) + + return cs \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py index 929e99048..341cc5065 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py @@ -31,18 +31,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder: return self - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - """ - Adds the self into the 'X' dictionary and returns it. - Args: - X (Dict[str, Any]): 'X' dictionary - - Returns: - (Dict[str, Any]): the updated 'X' dictionary - """ - X.update({'encoder': self.preprocessor}) - return X - @staticmethod def get_properties( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py index 5c9281891..b91387d66 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py @@ -20,12 +20,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder: self.check_requirements(X, y) - self.preprocessor['categorical'] = OHE( - # It is safer to have the OHE produce a 0 array than to crash a good configuration - categories=X['dataset_properties']['categories'] - if len(X['dataset_properties']['categories']) > 0 else 'auto', - sparse=False, - handle_unknown='ignore') + if self._has_encode_columns(X): + self.preprocessor['encode'] = OHE( + # It is safer to have the OHE produce a 0 array than to crash a good configuration + sparse=False, + handle_unknown='ignore') return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py index eadc0a188..b62822107 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py @@ -13,8 +13,11 @@ class BaseEncoder(autoPyTorchTabularPreprocessingComponent): def __init__(self) -> None: super().__init__() self.add_fit_requirements([ - FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), - FitRequirement('categories', (List,), user_defined=True, dataset_property=True)]) + FitRequirement('encode_columns', (List,), user_defined=True, dataset_property=False)]) + + @staticmethod + def _has_encode_columns(X: Dict[str, Any]): + return 
len(X.get('encode_columns', [])) > 0 def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ @@ -25,8 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]): the updated 'X' dictionary """ - if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: - raise ValueError("cant call transform on {} without fitting first." - .format(self.__class__.__name__)) X.update({'encoder': self.preprocessor}) return X diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/MinMaxScaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/MinMaxScaler.py index 97766217b..7f19f44d6 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/MinMaxScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/MinMaxScaler.py @@ -23,7 +23,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: self.check_requirements(X, y) - self.preprocessor['numerical'] = SklearnMinMaxScaler(feature_range=self.feature_range, copy=False) + if self._has_scale_columns(X): + self.preprocessor['scale'] = SklearnMinMaxScaler(feature_range=self.feature_range, copy=False) return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py index 9d50aa8f5..e5fc369f0 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py @@ -32,20 +32,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: return self - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - """ - The transform function calls the transform function of the - underlying model and returns the transformed array. 
- - Args: - X (np.ndarray): input features - - Returns: - np.ndarray: Transformed features - """ - X.update({'scaler': self.preprocessor}) - return X - @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None ) -> Dict[str, Union[str, bool]]: diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/Normalizer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/Normalizer.py index 678071378..cb6e2daf4 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/Normalizer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/Normalizer.py @@ -34,7 +34,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: self.check_requirements(X, y) map_norm = dict({"mean_abs": "l1", "mean_squared": "l2", "max": "max"}) - self.preprocessor['numerical'] = SklearnNormalizer(norm=map_norm[self.norm], copy=False) + if self._has_scale_columns(X): + self.preprocessor['scale'] = SklearnNormalizer(norm=map_norm[self.norm], copy=False) return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py index 2c59d77c2..5d18794b7 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py @@ -40,7 +40,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: self.check_requirements(X, y) with_centering = bool(not X['dataset_properties']['issparse']) - self.preprocessor['numerical'] = SklearnRobustScaler(quantile_range=(self.q_min, self.q_max), + if self._has_scale_columns(X): + self.preprocessor['scale'] = SklearnRobustScaler(quantile_range=(self.q_min, self.q_max), with_centering=with_centering, copy=False) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/StandardScaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/StandardScaler.py index 664f45e04..173b959fa 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/StandardScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/StandardScaler.py @@ -27,7 +27,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: self.check_requirements(X, y) with_mean, with_std = (False, False) if X['dataset_properties']['issparse'] else (True, True) - self.preprocessor['numerical'] = SklearnStandardScaler(with_mean=with_mean, with_std=with_std, copy=False) + if self._has_scale_columns(X): + self.preprocessor['scale'] = SklearnStandardScaler(with_mean=with_mean, with_std=with_std, copy=False) return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py index 39834dd2b..f9f43c58f 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py @@ -14,8 +14,12 @@ class BaseScaler(autoPyTorchTabularPreprocessingComponent): def __init__(self) -> None: super().__init__() self.add_fit_requirements([ - FitRequirement('numerical_columns', (List,), 
user_defined=True, dataset_property=True)]) + FitRequirement('scale_columns', (List,), user_defined=True, dataset_property=False)]) + @staticmethod + def _has_scale_columns(X: Dict[str, Any]): + return len(X.get('scale_columns', [])) > 0 + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ Adds the fitted scalar into the 'X' dictionary and returns it. @@ -25,8 +29,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]): the updated 'X' dictionary """ - if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: - raise ValueError("cant call transform on {} without fitting first." - .format(self.__class__.__name__)) X.update({'scaler': self.preprocessor}) return X diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/NoSkewTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/NoSkewTransformer.py new file mode 100644 index 000000000..9ea4801e8 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/NoSkewTransformer.py @@ -0,0 +1,42 @@ +from typing import Any, Dict, Optional, Union + +import numpy as np + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.skew_transformer.base_skew_transformer import BaseSkewTransformer + + +class NoSkewTransformer(BaseSkewTransformer): + """ + No scaling performed + """ + def __init__(self, + random_state: Optional[Union[np.random.RandomState, int]] = None + ): + super().__init__() + self.random_state = random_state + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseSkewTransformer: + """ + The fit function calls the fit function of the underlying model + and returns the transformed array. 
+ Args: + X (np.ndarray): input features + y (Optional[np.ndarray]): input labels + + Returns: + instance of self + """ + + self.check_requirements(X, y) + + return self + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'NoSkewTransformer', + 'name': 'No Skew Transformer', + 'handles_sparse': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/PowerTransformer.py similarity index 76% rename from autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py rename to autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/PowerTransformer.py index 7dd2502f9..0cd231666 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/PowerTransformer.py @@ -5,10 +5,10 @@ from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.skew_transformer.base_skew_transformer import BaseSkewTransformer -class PowerTransformer(BaseScaler): +class PowerTransformer(BaseSkewTransformer): """ Map data to as close to a Gaussian distribution as possible in order to reduce variance and minimize skewness. @@ -21,11 +21,12 @@ def __init__(self, super().__init__() self.random_state = random_state - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseSkewTransformer: self.check_requirements(X, y) - self.preprocessor['numerical'] = SklearnPowerTransformer(method='yeo-johnson', copy=False) + if self._has_skew_columns(X): + self.preprocessor['skew'] = SklearnPowerTransformer(method='yeo-johnson', copy=False) return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/QuantileTransformer.py similarity index 90% rename from autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py rename to autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/QuantileTransformer.py index cc0b4fa7a..7bd4e5482 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/QuantileTransformer.py @@ -11,11 +11,11 @@ from sklearn.preprocessing import QuantileTransformer as SklearnQuantileTransformer from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.skew_transformer.base_skew_transformer import BaseSkewTransformer from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter -class QuantileTransformer(BaseScaler): +class 
QuantileTransformer(BaseSkewTransformer): """ Transform the features to follow a uniform or a normal distribution using quantiles information. @@ -34,11 +34,12 @@ def __init__( self.n_quantiles = n_quantiles self.output_distribution = output_distribution - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseSkewTransformer: self.check_requirements(X, y) - self.preprocessor['numerical'] = SklearnQuantileTransformer(n_quantiles=self.n_quantiles, + if self._has_skew_columns(X): + self.preprocessor['skew'] = SklearnQuantileTransformer(n_quantiles=self.n_quantiles, output_distribution=self.output_distribution, copy=False) return self diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/__init__.py new file mode 100644 index 000000000..421632101 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/__init__.py @@ -0,0 +1,143 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.skew_transformer.base_skew_transformer import BaseSkewTransformer + +skew_transforming_directory = os.path.split(__file__)[0] +_skew_transformers = find_components(__package__, + skew_transforming_directory, + BaseSkewTransformer) + +_addons = ThirdPartyComponents(BaseSkewTransformer) + + +def add_skew_transformer(skew_transformer: BaseSkewTransformer) -> None: + _addons.add_component(skew_transformer) + + +class SkewTransformerChoice(autoPyTorchChoice): + """ + Allows for dynamically choosing skew_transforming component at runtime + """ + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available skew_transformer components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseSkewTransformers components available + as choices for skew_transforming + """ + components = OrderedDict() + components.update(_skew_transformers) + components.update(_addons.components) + return components + + def get_hyperparameter_search_space(self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None) -> ConfigurationSpace: + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = dict() + + dataset_properties = {**self.dataset_properties, **dataset_properties} + + available_skew_transformers = self.get_available_components(dataset_properties=dataset_properties, + include=include, + exclude=exclude) + + if len(available_skew_transformers) == 0: + raise ValueError("no skew_transformers found, please add a skew_transformer") + + if default is None: + defaults = [ + 'PowerTransformer', + 'QuantileTransformer', + 'NoSkewTransformer' + ] + for default_ in defaults: + if default_ in available_skew_transformers: + if include is not None and default_ not in include: + continue + if exclude is 
not None and default_ in exclude: + continue + default = default_ + break + + numerical_columns = dataset_properties['numerical_columns']\ + if isinstance(dataset_properties['numerical_columns'], List) else [] + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_skew_transformers): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_skew_transformers, + choice_hyperparameter.value_range)) + if len(numerical_columns) == 0: + assert len(choice_hyperparameter.value_range) == 1 + if 'NoSkewTransformer' not in choice_hyperparameter.value_range: + raise ValueError("Provided {} in choices, however, the dataset " + "is incompatible with it".format(choice_hyperparameter.value_range)) + + preprocessor = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + # add only no skew_transformer to choice hyperparameters in case the dataset is only categorical + if len(numerical_columns) == 0: + default = 'NoSkewTransformer' + if include is not None and default not in include: + raise ValueError("Provided {} in include, however, " + "the dataset is incompatible with it".format(include)) + preprocessor = CSH.CategoricalHyperparameter('__choice__', + ['NoSkewTransformer'], + default_value=default) + else: + preprocessor = CSH.CategoricalHyperparameter('__choice__', + list(available_skew_transformers.keys()), + default_value=default) + cs.add_hyperparameter(preprocessor) + + # add only child hyperparameters of preprocessor choices + for name in preprocessor.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_skew_transformers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore + **updates) + parent_hyperparameter = {'parent': preprocessor, 'value': name} + cs.add_configuration_space(name, config_space, + parent_hyperparameter=parent_hyperparameter) + + self.configuration_space = cs + self.dataset_properties = dataset_properties + return cs + + def _check_dataset_properties(self, dataset_properties: Dict[str, BaseDatasetPropertiesType]) -> None: + """ + A mechanism in code to ensure the correctness of the fit dictionary + It recursively makes sure that the children and parent level requirements + are honored before fit. 
+ Args: + dataset_properties: + + """ + super()._check_dataset_properties(dataset_properties) + assert 'numerical_columns' in dataset_properties.keys() and \ + 'categorical_columns' in dataset_properties.keys(), \ + "Dataset properties must contain information about the type of columns" diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/base_skew_transformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/base_skew_transformer.py new file mode 100644 index 000000000..d62055f6f --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/skew_transformer/base_skew_transformer.py @@ -0,0 +1,33 @@ +from typing import Any, Dict, List + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import ( + autoPyTorchTabularPreprocessingComponent +) +from autoPyTorch.utils.common import FitRequirement + + +class BaseSkewTransformer(autoPyTorchTabularPreprocessingComponent): + """ + Provides abstract class interface for Scalers in AutoPytorch + """ + + def __init__(self) -> None: + super().__init__() + self.add_fit_requirements([ + FitRequirement('skew_columns', (List,), user_defined=True, dataset_property=False)]) + + @staticmethod + def _has_skew_columns(X: Dict[str, Any]): + return len(X.get('skew_columns', [])) > 0 + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the fitted scalar into the 'X' dictionary and returns it. + Args: + X (Dict[str, Any]): 'X' dictionary + + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + X.update({'skew_transformer': self.preprocessor}) + return X \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py index e71583e3e..d6d1a60da 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py @@ -21,7 +21,8 @@ def get_tabular_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator Returns: (Dict[str, List[BaseEstimator]]): dictionary with list of numerical and categorical preprocessors """ - preprocessor: Dict[str, List[BaseEstimator]] = dict(numerical=list(), categorical=list()) + preprocessor: Dict[str, List[BaseEstimator]] = dict(numerical=list(), categorical=list(), scale=list(), encode=list(), skew=list()) + for key, value in X.items(): if isinstance(value, dict): # as each preprocessor is child of BaseEstimator @@ -29,5 +30,11 @@ def get_tabular_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator preprocessor['numerical'].append(value['numerical']) if 'categorical' in value and isinstance(value['categorical'], BaseEstimator): preprocessor['categorical'].append(value['categorical']) + if 'scale' in value and isinstance(value['scale'], BaseEstimator): + preprocessor['scale'].append(value['scale']) + if 'encode' in value and isinstance(value['encode'], BaseEstimator): + preprocessor['encode'].append(value['encode']) + if 'skew' in value and isinstance(value['skew'], BaseEstimator): + preprocessor['skew'].append(value['skew']) return preprocessor diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py index 597f14ca6..5b60ff4ed 100644 --- 
a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py @@ -40,7 +40,10 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X['X_train'] = preprocess(dataset=X_train, transforms=transforms) # We need to also save the preprocess transforms for inference - X.update({'preprocess_transforms': transforms}) + X.update({ + 'preprocess_transforms': transforms, + 'shape_after_preprocessing': X['X_train'].shape[1:] + }) return X @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index 7ec872b96..6b68fe973 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -36,7 +36,6 @@ def __init__( FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), ]) - self.network = network self.final_activation: Optional[torch.nn.Module] = None def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py index 5b6e48bf1..bee4a6abc 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py @@ -55,7 +55,8 @@ def _add_layer(self, layers: List[nn.Module], in_features: int, out_features: in """ layers.append(nn.Linear(in_features, out_features)) - layers.append(nn.BatchNorm1d(out_features)) + if self.config['use_batch_norm']: + layers.append(nn.BatchNorm1d(out_features)) layers.append(_activations[self.config["activation"]]()) if self.config['use_dropout']: layers.append(nn.Dropout(self.config["dropout_%d" % layer_id])) @@ -86,6 +87,10 @@ def get_hyperparameter_search_space( value_range=(True, False), default_value=False, ), + use_batch_norm: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="use_batch_norm", + value_range=(True, False), + default_value=False, + ), num_units: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_units", value_range=(10, 1024), default_value=200, @@ -105,6 +110,9 @@ def get_hyperparameter_search_space( num_groups = get_hyperparameter(num_groups, UniformIntegerHyperparameter) add_hyperparameter(cs, activation, CategoricalHyperparameter) + # whether to use batch normalization + add_hyperparameter(cs, use_batch_norm, CategoricalHyperparameter) + # We can have dropout in the network for # better generalization dropout_flag = False diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index ef3cc1768..f3957bbc2 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -30,7 +30,7 @@ def __init__(self, self.add_fit_requirements([ FitRequirement('X_train', (np.ndarray, pd.DataFrame, spmatrix), user_defined=True, dataset_property=False), - FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement('shape_after_preprocessing', (Iterable,), user_defined=False, 
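The MLPBackbone hunk above turns batch normalization into a searchable `use_batch_norm` hyperparameter instead of always inserting `nn.BatchNorm1d`. A small standalone sketch of the resulting layer construction (the feature sizes and dropout probability are illustrative, not values from the PR):

    import torch.nn as nn

    def build_mlp_block(in_features: int, out_features: int,
                        use_batch_norm: bool, use_dropout: bool, dropout: float = 0.5) -> nn.Sequential:
        layers = [nn.Linear(in_features, out_features)]
        if use_batch_norm:
            # only added when the hyperparameter is switched on
            layers.append(nn.BatchNorm1d(out_features))
        layers.append(nn.ReLU())
        if use_dropout:
            layers.append(nn.Dropout(dropout))
        return nn.Sequential(*layers)

    block = build_mlp_block(20, 64, use_batch_norm=True, use_dropout=False)
    print(block)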
dataset_property=False), FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), FitRequirement('network_embedding', (nn.Module,), user_defined=False, dataset_property=False) ]) @@ -49,9 +49,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: Self """ self.check_requirements(X, y) - X_train = X['X_train'] - - input_shape = X_train.shape[1:] + input_shape = X['shape_after_preprocessing'] input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) self.input_shape = input_shape diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index 1af7ad7af..fd8f5eca5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -12,7 +12,8 @@ _activations = { "relu": torch.nn.ReLU, "tanh": torch.nn.Tanh, - "sigmoid": torch.nn.Sigmoid + "sigmoid": torch.nn.Sigmoid, + "elu": torch.nn.ELU } @@ -25,7 +26,7 @@ def get_output_shape(network: torch.nn.Module, input_shape: Tuple[int, ...] :param input_shape: shape of the input :return: output_shape """ - placeholder = torch.randn((2, *input_shape), dtype=torch.float) + placeholder = torch.randint(high=2, size=(2, *input_shape), dtype=torch.float) with torch.no_grad(): output = network(placeholder) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index 49ecf40b7..2a391f754 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -1,9 +1,11 @@ +from math import ceil from typing import Any, Dict, List, Optional, Union from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, UniformFloatHyperparameter, - UniformIntegerHyperparameter + UniformIntegerHyperparameter, ) import numpy as np @@ -16,39 +18,45 @@ from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter +def get_num_output_dimensions(config: Dict[str, Any], num_categs_per_feature: List[int]) -> List[int]: + """ Returns a list of embedding sizes, one per categorical variable. + Selects these adaptively based on the training dataset. + Note: Assumes there is at least one embed feature.
+ """ + max_embedding_dim = config['max_embedding_dim'] + embed_exponent = config['embed_exponent'] + size_factor = config['embedding_size_factor'] + num_output_dimensions = [int(size_factor*max( + 2, + min(max_embedding_dim, + 1.6 * num_categories**embed_exponent))) + if num_categories > 0 else 1 for num_categories in num_categs_per_feature] + return num_output_dimensions + + class _LearnedEntityEmbedding(nn.Module): """ Learned entity embedding module for categorical features""" - def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_numerical_features: int): + def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, num_features_excl_embed: int): """ Args: config (Dict[str, Any]): The configuration sampled by the hyperparameter optimizer num_input_features (np.ndarray): column wise information of number of output columns after transformation for each categorical column and 0 for numerical columns - num_numerical_features (int): number of numerical features in X + num_features_excl_embed (int): number of features in X excluding the features that need to be embedded """ super().__init__() self.config = config - - self.num_numerical = num_numerical_features # list of number of categories of categorical data # or 0 for numerical data - self.num_input_features = num_input_features - categorical_features = self.num_input_features > 0 - - self.num_categorical_features = self.num_input_features[categorical_features] - - self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in - self.num_input_features] - self.num_output_dimensions = [0] * num_numerical_features - self.num_output_dimensions.extend([config["dimension_reduction_" + str(i)] * num_in for i, num_in in - enumerate(self.num_categorical_features)]) - self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in - zip(self.num_output_dimensions, self.num_input_features)] - self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in - zip(self.num_output_dimensions, self.embed_features, - self.num_input_features)] - self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) + self.num_categories_per_col = num_categories_per_col + self.embed_features = self.num_categories_per_col > 0 + + self.num_embed_features = self.num_categories_per_col[self.embed_features] + + self.num_output_dimensions = get_num_output_dimensions(config, self.num_categories_per_col) + + self.num_out_feats = num_features_excl_embed + sum(self.num_output_dimensions) self.ee_layers = self._create_ee_layers() @@ -56,32 +64,30 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # pass the columns of each categorical feature through entity embedding layer # before passing it through the model concat_seq = [] - last_concat = 0 + x_pointer = 0 layer_pointer = 0 - for num_in, embed in zip(self.num_input_features, self.embed_features): + for x_pointer, embed in enumerate(self.embed_features): + current_feature_slice = x[:, x_pointer] if not embed: x_pointer += 1 + concat_seq.append(current_feature_slice.view(-1, 1)) continue - if x_pointer > last_concat: - concat_seq.append(x[:, last_concat: x_pointer]) - categorical_feature_slice = x[:, x_pointer: x_pointer + num_in] - concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) + current_feature_slice = current_feature_slice.to(torch.int) + concat_seq.append(self.ee_layers[layer_pointer](current_feature_slice)) layer_pointer += 1 - x_pointer += num_in - 
last_concat = x_pointer - concat_seq.append(x[:, last_concat:]) return torch.cat(concat_seq, dim=1) def _create_ee_layers(self) -> nn.ModuleList: # entity embeding layers are Linear Layers layers = nn.ModuleList() - for i, (num_in, embed, num_out) in enumerate(zip(self.num_input_features, self.embed_features, - self.num_output_dimensions)): + for num_cat, embed, num_out in zip(self.num_categories_per_col, + self.embed_features, + self.num_output_dimensions): if not embed: continue - layers.append(nn.Linear(num_in, num_out)) + layers.append(nn.Embedding(num_cat, num_out)) return layers @@ -94,33 +100,32 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None, **kwarg super().__init__(random_state=random_state) self.config = kwargs - def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: + def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_embed: int) -> nn.Module: return _LearnedEntityEmbedding(config=self.config, - num_input_features=num_input_features, - num_numerical_features=num_numerical_features) + num_categories_per_col=num_categories_per_col, + num_features_excl_embed=num_features_excl_embed) @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - min_unique_values_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter="min_unique_values_for_embedding", - value_range=(3, 7), - default_value=5, - log=True), - dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction", - value_range=(0, 1), - default_value=0.5), + embed_exponent: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embed_exponent", + value_range=(0.56,), + default_value=0.56), + max_embedding_dim: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="max_embedding_dim", + value_range=(100,), + default_value=100), + embedding_size_factor: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embedding_size_factor", + value_range=(1.0, 0.5, 1.5, 0.7, 0.6, 0.8, 0.9, 1.1, 1.2, 1.3, 1.4), + default_value=1, + ), ) -> ConfigurationSpace: cs = ConfigurationSpace() - add_hyperparameter(cs, min_unique_values_for_embedding, UniformIntegerHyperparameter) if dataset_properties is not None: - for i in range(len(dataset_properties['categorical_columns']) - if isinstance(dataset_properties['categorical_columns'], List) else 0): - ee_dimensions_search_space = HyperparameterSearchSpace(hyperparameter="dimension_reduction_" + str(i), - value_range=dimension_reduction.value_range, - default_value=dimension_reduction.default_value, - log=dimension_reduction.log) - add_hyperparameter(cs, ee_dimensions_search_space, UniformFloatHyperparameter) + if len(dataset_properties['categorical_columns']) > 0: + add_hyperparameter(cs, embed_exponent, UniformFloatHyperparameter) + add_hyperparameter(cs, max_embedding_dim, UniformIntegerHyperparameter) + add_hyperparameter(cs, embedding_size_factor, CategoricalHyperparameter) + return cs @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py index 830bdbb00..73d4708a0 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -24,7 +24,7 @@ class NoEmbedding(NetworkEmbeddingComponent): def 
__init__(self, random_state: Optional[np.random.RandomState] = None): super().__init__(random_state=random_state) - def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: + def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_embed: int) -> nn.Module: return _NoEmbedding() @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 998055d2b..6b88e4929 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,5 +1,4 @@ -import copy -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import numpy as np @@ -8,27 +7,32 @@ from torch import nn from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent +from autoPyTorch.utils.common import FitRequirement class NetworkEmbeddingComponent(autoPyTorchSetupComponent): def __init__(self, random_state: Optional[np.random.RandomState] = None): super().__init__(random_state=random_state) + self.add_fit_requirements([ + FitRequirement('num_categories_per_col', (List,), user_defined=True, dataset_property=True), + FitRequirement('shape_after_preprocessing', (Tuple,), user_defined=False, dataset_property=False)]) + self.embedding: Optional[nn.Module] = None def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - num_numerical_columns, num_input_features = self._get_required_info_from_data(X) + num_features_excl_embed, num_categories_per_col = self._get_required_info_from_data(X) self.embedding = self.build_embedding( - num_input_features=num_input_features, - num_numerical_features=num_numerical_columns) + num_categories_per_col=num_categories_per_col, + num_features_excl_embed=num_features_excl_embed) return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'network_embedding': self.embedding}) return X - def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: + def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_embed: int) -> nn.Module: raise NotImplementedError def _get_required_info_from_data(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: @@ -48,22 +52,16 @@ def _get_required_info_from_data(self, X: Dict[str, Any]) -> Tuple[int, np.ndarr number of categories for categorical columns and 0 for numerical columns """ - # Feature preprocessors can alter numerical columns - if len(X['dataset_properties']['numerical_columns']) == 0: - num_numerical_columns = 0 - else: - X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) - - numerical_column_transformer = X['tabular_transformer'].preprocessor. 
\ - named_transformers_['numerical_pipeline'] - num_numerical_columns = numerical_column_transformer.transform( - X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] + num_cols = X['shape_after_preprocessing'] + # only works for 2D(rows, features) tabular data + num_features_excl_embed = num_cols[0] - len(X['embed_columns']) + + num_categories_per_col = np.zeros(num_cols, dtype=np.int16) - num_cols = num_numerical_columns + len(X['dataset_properties']['categorical_columns']) - num_input_feats = np.zeros(num_cols, dtype=np.int32) + categories_per_embed_col = X['dataset_properties']['num_categories_per_col'] - categories = X['dataset_properties']['categories'] - for idx, cats in enumerate(categories, start=num_numerical_columns): - num_input_feats[idx] = len(cats) + # only fill num categories for embedding columns + for idx, cats in enumerate(categories_per_embed_col, start=num_features_excl_embed): + num_categories_per_col[idx] = cats - return num_numerical_columns, num_input_feats + return num_features_excl_embed, num_categories_per_col diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/base_model.py b/autoPyTorch/pipeline/components/setup/traditional_ml/base_model.py index 7d26c5481..d53298665 100644 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/base_model.py +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/base_model.py @@ -4,10 +4,13 @@ from abc import abstractmethod from typing import Any, Dict, List, Optional, Tuple, Union +from ConfigSpace.configuration_space import Configuration + import numpy as np import pandas as pd +from sklearn.base import BaseEstimator from sklearn.utils import check_random_state import torch @@ -84,9 +87,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchSetupComponent: logger_port=X['logger_port'] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, output_shape=output_shape, + dataset_properties=X['dataset_properties'], task_type=X['dataset_properties']['task_type'], output_type=X['dataset_properties']['output_type'], - optimize_metric=X['optimize_metric'] if 'optimize_metric' in X else None) + optimize_metric=X['optimize_metric'] if 'optimize_metric' in X else None, + time_limit=X['func_eval_time_limit_secs']) # train model blockPrint() @@ -102,6 +107,30 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchSetupComponent: self.fit_output["test_preds"] = test_preds return self + def set_hyperparameters(self, + configuration: Configuration, + init_params: Optional[Dict[str, Any]] = None + ) -> BaseEstimator: + """ + Applies a configuration to the given component. + This method translate a hierarchical configuration key, + to an actual parameter of the autoPyTorch component. 
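With the rewrite above, `_get_required_info_from_data` no longer reloads the data manager or re-runs the tabular transformer: it reads the post-preprocessing column count from `X['shape_after_preprocessing']`, subtracts the columns earmarked for embedding, and fills category counts only at those trailing positions. A toy sketch of that bookkeeping with invented numbers (the column names and counts are hypothetical):

    import numpy as np

    shape_after_preprocessing = (6,)        # hypothetical: 6 columns after early preprocessing
    embed_columns = ['f4', 'f5']            # hypothetical: two columns routed to the embedding
    categories_per_embed_col = [7, 12]      # dataset_properties['num_categories_per_col'] for them

    num_cols = shape_after_preprocessing
    num_features_excl_embed = num_cols[0] - len(embed_columns)

    num_categories_per_col = np.zeros(num_cols, dtype=np.int16)
    for idx, cats in enumerate(categories_per_embed_col, start=num_features_excl_embed):
        num_categories_per_col[idx] = cats

    print(num_features_excl_embed, num_categories_per_col)     # 4 [ 0  0  0  0  7 12]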
+ + Args: + configuration (Configuration): + Which configuration to apply to the chosen component + init_params (Optional[Dict[str, any]]): + Optional arguments to initialize the chosen component + + Returns: + An instance of self + """ + params = configuration.get_dictionary() + + setattr(self, 'config', params) + + return self + @abstractmethod def build_model( self, @@ -110,6 +139,7 @@ def build_model( logger_port: int, task_type: str, output_type: str, + time_limit: Optional[int] = None, optimize_metric: Optional[str] = None ) -> BaseTraditionalLearner: """ diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json deleted file mode 100644 index c65a311fe..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/catboost.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "iterations" : 10000, - "learning_rate" : 0.1 -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json deleted file mode 100644 index 81f1d6383..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/extra_trees.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "n_estimators" : 300 -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json deleted file mode 100644 index 0fa7f95d4..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/knn.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "weights" : "uniform" -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json deleted file mode 100644 index d8e061f5e..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/lgb.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "num_rounds" : 10000, - "num_leaves" : 128, - "two_round" : "True", - "min_data_in_leaf" : 3, - "feature_fraction" : 0.9, - "boosting_type" : "gbdt", - "learning_rate" : 0.03 -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json deleted file mode 100644 index 81f1d6383..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/random_forest.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "n_estimators" : 300 -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json deleted file mode 100644 index 2c63c0851..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/rotation_forest.json +++ /dev/null @@ -1,2 +0,0 @@ -{ -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json b/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json deleted file mode 100644 index e5f3c5622..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs/svm.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "C" : 1.0, - "degree" : 3 -} diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/tabular_traditional_model.py 
b/autoPyTorch/pipeline/components/setup/traditional_ml/tabular_traditional_model.py index 588fb83ed..b20427e77 100644 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/tabular_traditional_model.py +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/tabular_traditional_model.py @@ -1,4 +1,5 @@ from typing import Any, Dict, List, Optional, Tuple, Type, Union +import re from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -11,8 +12,10 @@ from autoPyTorch.pipeline.components.setup.traditional_ml.base_model import BaseModelComponent from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import ( BaseTraditionalLearner, get_available_traditional_learners) +from autoPyTorch.utils.common import HyperparameterSearchSpace +# TODO: Make this a choice and individual components for each traditional classifier class TabularTraditionalModel(BaseModelComponent): """ Implementation of a dynamic model, that consists of a learner and a head @@ -38,25 +41,66 @@ def get_properties( "name": "Tabular Traditional Model", } - @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + + def get_hyperparameter_search_space(self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, **kwargs: Any) -> ConfigurationSpace: cs = ConfigurationSpace() - traditional_learners: Dict[str, Type[BaseTraditionalLearner]] = get_available_traditional_learners() + available_traditional_learners: Dict[str, Type[BaseTraditionalLearner]] = get_available_traditional_learners() # Remove knn if data is all categorical if dataset_properties is not None: numerical_columns = dataset_properties['numerical_columns'] \ if isinstance(dataset_properties['numerical_columns'], List) else [] if len(numerical_columns) == 0: - del traditional_learners['knn'] - learner_hp = CategoricalHyperparameter("traditional_learner", choices=traditional_learners.keys()) + del available_traditional_learners['knn'] + + updates = self._get_search_space_updates() + + if 'traditional_learner' in updates: + learner_hp = CategoricalHyperparameter("traditional_learner", choices=updates['traditional_learner'].value_range) + else: + learner_hp = CategoricalHyperparameter("traditional_learner", choices=available_traditional_learners.keys()) cs.add_hyperparameters([learner_hp]) + for name in learner_hp.choices: + child_updates = self._get_child_search_space_updates(prefix=name) + model_configuration_space = available_traditional_learners[name]. \ + get_hyperparameter_search_space(dataset_properties, **child_updates) + parent_hyperparameter = {'parent': learner_hp, 'value': name} + cs.add_configuration_space( + name, + model_configuration_space, + parent_hyperparameter=parent_hyperparameter + ) + return cs + def _get_child_search_space_updates(self, prefix: Optional[str] = None) -> Dict[str, HyperparameterSearchSpace]: + """Get the search space updates with the given prefix + + Args: + prefix (str): + Only return search space updates with given prefix (default: {None}) + + Returns: + Dict[str, HyperparameterSearchSpace]: + Mapping of search space updates. Keys don't contain the prefix. 
+ """ + + result: Dict[str, HyperparameterSearchSpace] = dict() + + # iterate over all search space updates of this node and keep the ones that have the given prefix + for key in self._cs_updates.keys(): + if prefix is None: + result[key] = self._cs_updates[key].get_search_space() + elif re.search(f'^{prefix}', key) is not None: + result[key[len(prefix) + 1:]] = self._cs_updates[key].get_search_space(remove_prefix=prefix) + return result + def build_model(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], - logger_port: int, task_type: str, output_type: str, optimize_metric: Optional[str] = None + dataset_properties: Dict[str, BaseDatasetPropertiesType], + logger_port: int, task_type: str, output_type: str, optimize_metric: Optional[str] = None, + time_limit: Optional[int] = None, ) -> BaseTraditionalLearner: """ This method returns a traditional learner, that is dynamically @@ -64,14 +108,19 @@ def build_model(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ... the additional configuration hyperparameters to build a domain specific model """ - learner_name = self.config["traditional_learner"] + learner_name = self.config.pop("traditional_learner") Learner = self._traditional_learners[learner_name] + config = self._remove_prefix_config(learner_name=learner_name) learner = Learner(random_state=self.random_state, logger_port=logger_port, - task_type=task_type, output_type=output_type, optimize_metric=optimize_metric) + task_type=task_type, output_type=output_type, optimize_metric=optimize_metric, + dataset_properties=dataset_properties, time_limit=time_limit, **config) return learner + def _remove_prefix_config(self, learner_name): + return {key.replace(f'{learner_name}:', ''): value for key, value in self.config.items()} + def __str__(self) -> str: """ Allow a nice understanding of what components where used """ return f"TabularTraditionalModel: {self.model.name if self.model is not None else None}" diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/__init__.py index f4a7b98de..34e71bf05 100644 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/__init__.py +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/__init__.py @@ -1,17 +1,17 @@ -from typing import Any, Dict, Type, Union +from typing import Any, Dict, Optional, Type, Union +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, ) from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ BaseTraditionalLearner -from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.learners import ( - CatboostModel, - ExtraTreesModel, - KNNModel, - LGBModel, - RFModel, - SVMModel) +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.lgbm.lgbm import LGBModel +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.catboost.catboost import CatboostModel +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.random_forest.random_forest import RFModel +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.extratrees.extratrees import ExtraTreesModel +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.knn.knn import KNNModel +from 
autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.xgboost.xgboost import XGBModel _traditional_learners = { # Sort by more robust models @@ -28,8 +28,8 @@ 'catboost': CatboostModel, 'random_forest': RFModel, 'extra_trees': ExtraTreesModel, - 'svm': SVMModel, 'knn': KNNModel, + 'xgboost': XGBModel } _addons = ThirdPartyComponents(BaseTraditionalLearner) @@ -38,7 +38,14 @@ def add_traditional_learner(traditional_learner: BaseTraditionalLearner) -> None _addons.add_component(traditional_learner) -def get_available_traditional_learners() -> Dict[str, Union[Type[BaseTraditionalLearner], Any]]: +def get_available_traditional_learners( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, +) -> Dict[str, Union[Type[BaseTraditionalLearner], Any]]: traditional_learners = dict() traditional_learners.update(_traditional_learners) + traditional_learners.update(_addons.components) + + if dataset_properties is not None and len(dataset_properties['numerical_columns']) ==0: + traditional_learners.pop('knn', None) + return traditional_learners diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/base_traditional_learner.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/base_traditional_learner.py index 9c0166a9f..a9b306475 100644 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/base_traditional_learner.py +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/base_traditional_learner.py @@ -2,7 +2,7 @@ import logging.handlers import os as os from abc import abstractmethod -from typing import Any, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union from catboost import CatBoost @@ -13,7 +13,8 @@ from sklearn.base import BaseEstimator from sklearn.utils import check_random_state -from autoPyTorch.constants import REGRESSION_TASKS, STRING_TO_TASK_TYPES +from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType +from autoPyTorch.constants import REGRESSION_TASKS, STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics from autoPyTorch.utils.logging_ import get_named_client_logger @@ -42,9 +43,12 @@ class BaseTraditionalLearner: def __init__(self, task_type: str, output_type: str, + params_func: Optional[Callable], + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, optimize_metric: Optional[str] = None, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, random_state: Optional[np.random.RandomState] = None, + time_limit: Optional[int] = None, name: Optional[str] = None): self.model: Optional[Union[CatBoost, BaseEstimator]] = None @@ -61,13 +65,15 @@ def __init__(self, self.random_state = check_random_state(1) else: self.random_state = check_random_state(random_state) - self.config = self.get_config() + + self.output_type = STRING_TO_OUTPUT_TYPES[output_type] + self.config = params_func(self.output_type) self.all_nan: Optional[np.ndarray] = None self.num_classes: Optional[int] = None - + self.time_limit = time_limit self.is_classification = STRING_TO_TASK_TYPES[task_type] not in REGRESSION_TASKS - + self.dataset_properties = dataset_properties self.metric = get_metrics(dataset_properties={'task_type': task_type, 'output_type': output_type}, names=[optimize_metric] if optimize_metric is not None else None)[0] @@ -76,16 +82,7 @@ def get_config(self) -> Dict[str, Union[int, str, float, bool]]: """ Load 
the parameters for the classifier model from ../estimator_configs/modelname.json. """ - dirname = os.path.dirname(os.path.abspath(__file__)) - config_path = os.path.join(dirname, "../estimator_configs", self.name + ".json") - with open(config_path, "r") as f: - config: Dict[str, Union[int, str, float, bool]] = json.load(f) - for k, v in config.items(): - if v == "True": - config[k] = True - if v == "False": - config[k] = False - return config + return self.config def _preprocess(self, X: np.ndarray diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/catboost.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/catboost.py new file mode 100644 index 000000000..c5f81fb7b --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/catboost.py @@ -0,0 +1,142 @@ +import logging.handlers +import tempfile +from typing import Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformIntegerHyperparameter, + UniformFloatHyperparameter +) + +import numpy as np + +from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.catboost.utils import ( + AutoPyTorchToCatboostMetrics, + EarlyStoppingCallback, + MemoryCheckCallback, + get_params +) + +from catboost import CatBoostClassifier, CatBoostRegressor, Pool +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter + +from autoPyTorch.utils.early_stopping import get_early_stopping_rounds + + +class CatboostModel(BaseTraditionalLearner): + + def __init__(self, + task_type: str, + output_type: str, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None, + time_limit: Optional[int] = None, + **kwargs + ): + super(CatboostModel, self).__init__(name="catboost", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, + output_type=output_type, + optimize_metric=optimize_metric, + dataset_properties=dataset_properties, + time_limit=time_limit, + params_func=get_params) + self.config["train_dir"] = tempfile.gettempdir() + self.config.update(kwargs) + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + if not self.is_classification: + self.config['eval_metric'] = AutoPyTorchToCatboostMetrics[self.metric.name].value + # CatBoost Cannot handle a random state object, just the seed + self.model = CatBoostRegressor(**self.config, random_state=self.random_state.get_state()[1][0]) + else: + self.config['eval_metric'] = AutoPyTorchToCatboostMetrics[self.metric.name].value + # CatBoost Cannot handle a random state object, just the seed + self.model = CatBoostClassifier(**self.config, random_state=self.random_state.get_state()[1][0]) + + def _fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray) -> 
None: + + assert self.model is not None, "No model found. Can't fit without preparing the model" + early_stopping = get_early_stopping_rounds(num_rows_train=X_train.shape[0]) + callbacks = [] + callbacks.append(EarlyStoppingCallback(stopping_rounds=early_stopping, eval_metric=self.config['eval_metric'])) + num_rows_train = X_train.shape[0] + num_cols_train = X_train.shape[1] + self.num_classes = len(np.unique(y_train)) if len(np.unique(y_train)) != 2 else 1 + if num_rows_train * num_cols_train * self.num_classes > 5_000_000: + # The data is large enough to potentially cause memory issues during training, so monitor memory usage via callback. + callbacks.append(MemoryCheckCallback()) + categoricals = [ind for ind in range(X_train.shape[1]) if isinstance(X_train[0, ind], str)] + + X_train_pooled = Pool(data=X_train, label=y_train, cat_features=categoricals) + X_val_pooled = Pool(data=X_val, label=y_val, cat_features=categoricals) + + self.model.fit(X_train_pooled, + eval_set=X_val_pooled, + use_best_model=True, + early_stopping_rounds=early_stopping, + callbacks=callbacks, + verbose=False) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + learning_rate: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='learning_rate', + value_range=(5e-3, 0.2), + default_value=0.05, + log=True + ), + depth: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='depth', + value_range=(5, 8), + default_value=6, + ), + l2_leaf_reg: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='l2_leaf_reg', + value_range=(1, 5), + default_value=3, + ), + ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) 
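For reference, the `_fit` above boils down to the standard CatBoost training pattern: wrap train and validation data in `Pool` objects with the detected categorical column indices, then train with early stopping on the validation set. A compact, self-contained version on synthetic numeric data (iteration counts and the stopping budget are illustrative, and the custom callbacks from catboost/utils.py are omitted):

    import numpy as np
    from catboost import CatBoostClassifier, Pool

    X_train = np.random.rand(200, 4)
    y_train = np.random.randint(0, 2, size=200)
    X_val = np.random.rand(50, 4)
    y_val = np.random.randint(0, 2, size=50)

    # string-valued columns would be declared as categorical; none exist in this toy data
    cat_features = [ind for ind in range(X_train.shape[1]) if isinstance(X_train[0, ind], str)]

    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)

    model = CatBoostClassifier(iterations=200, learning_rate=0.1, verbose=False)
    model.fit(train_pool, eval_set=val_pool, use_best_model=True, early_stopping_rounds=20)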
+ The strategy to use for numerical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ + cs = ConfigurationSpace() + + add_hyperparameter(cs, l2_leaf_reg, UniformIntegerHyperparameter) + add_hyperparameter(cs, depth, UniformIntegerHyperparameter) + add_hyperparameter(cs, learning_rate, UniformFloatHyperparameter) + + return cs + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'CBLearner', + 'name': 'Categorical Boosting Learner', + } diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/utils.py new file mode 100644 index 000000000..ffac75e6c --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/catboost/utils.py @@ -0,0 +1,138 @@ +from typing import Any, Dict +import logging +import time +import psutil +from enum import Enum + + +class AutoPyTorchToCatboostMetrics(Enum): + mean_absolute_error = "MAE" + root_mean_squared_error = "RMSE" + mean_squared_log_error = "MSLE" + r2 = "R2" + accuracy = "Accuracy" + balanced_accuracy = "BalancedAccuracy" + f1 = "F1" + roc_auc = "AUC" + precision = "Precision" + recall = "Recall" + log_loss = "Logloss" + + +class MemoryCheckCallback: + """ + Callback to ensure memory usage is safe, otherwise early stops the model to avoid OOM errors. + + This callback is CatBoost specific. + + Args: + + period : int, default = 10 + Number of iterations between checking memory status. Higher values are less precise but use less compute. + verbose : bool, default = False + Whether to log information on memory status even if memory usage is low. + """ + def __init__(self, period: int = 10, verbose=False): + self.period = period + self.mem_status = psutil.Process() + self.init_mem_rss = self.mem_status.memory_info().rss + self.init_mem_avail = psutil.virtual_memory().available + self.verbose = verbose + + self._cur_period = 1 + + def after_iteration(self, info): + iteration = info.iteration + if iteration % self._cur_period == 0: + not_enough_memory = self.memory_check(iteration) + if not_enough_memory: + return False + return True + + def memory_check(self, iter) -> bool: + """Checks if memory usage is unsafe. If so, then returns True to signal the model to stop training early.""" + available_bytes = psutil.virtual_memory().available + cur_rss = self.mem_status.memory_info().rss + + if cur_rss < self.init_mem_rss: + self.init_mem_rss = cur_rss + estimated_model_size_mb = (cur_rss - self.init_mem_rss) >> 20 + available_mb = available_bytes >> 20 + model_size_memory_ratio = estimated_model_size_mb / available_mb + + early_stop = False + if model_size_memory_ratio > 1.0: + early_stop = True + + if available_mb < 512: # Less than 500 MB + early_stop = True + + if early_stop: + return True + elif self.verbose or (model_size_memory_ratio > 0.25): + + if model_size_memory_ratio > 0.5: + self._cur_period = 1 # Increase rate of memory check if model gets large enough to cause OOM potentially + elif iter > self.period: + self._cur_period = self.period + + return False + + +class EarlyStoppingCallback: + """ + Early stopping callback. + + This callback is CatBoost specific. + + Args: + stopping_rounds : int or tuple + If int, The possible number of rounds without the trend occurrence. 
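The memory guard above works by comparing how much the training process has grown since the callback was created against the memory that is still free, and aborting (returning False from `after_iteration`) when that ratio exceeds 1.0 or fewer than roughly 512 MB remain. A stripped-down sketch of that check, ignoring the period throttling and the init_mem_rss reset in the real callback:

    import psutil

    def memory_unsafe(init_rss: int) -> bool:
        cur_rss = psutil.Process().memory_info().rss
        available_mb = psutil.virtual_memory().available >> 20
        estimated_model_size_mb = max(cur_rss - init_rss, 0) >> 20
        # stop when the model alone could exhaust the remaining memory, or headroom is already tiny
        return estimated_model_size_mb / max(available_mb, 1) > 1.0 or available_mb < 512

    init_rss = psutil.Process().memory_info().rss   # snapshot taken before training starts
    print(memory_unsafe(init_rss))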
+ If tuple, contains early stopping class as first element and class init kwargs as second element. + eval_metric : str + The eval_metric to use for early stopping. Must also be specified in the CatBoost model params. + compare_key : str, default = 'validation' + The data to use for scoring. It is recommended to keep as default. + """ + def __init__(self, stopping_rounds, eval_metric, compare_key='validation'): + if isinstance(stopping_rounds, int): + from autoPyTorch.utils.early_stopping import SimpleEarlyStopper + self.es = SimpleEarlyStopper(patience=stopping_rounds) + else: + self.es = stopping_rounds[0](**stopping_rounds[1]) + self.best_score = None + self.compare_key = compare_key + + if isinstance(eval_metric, str): + from catboost._catboost import is_maximizable_metric + is_max_optimal = is_maximizable_metric(eval_metric) + eval_metric_name = eval_metric + else: + is_max_optimal = eval_metric.is_max_optimal() + + eval_metric_name = eval_metric.__class__.__name__ + + self.eval_metric_name = eval_metric_name + self.is_max_optimal = is_max_optimal + + def after_iteration(self, info): + is_best_iter = False + cur_score = info.metrics[self.compare_key][self.eval_metric_name][-1] + if not self.is_max_optimal: + cur_score *= -1 + if self.best_score is None: + self.best_score = cur_score + elif cur_score > self.best_score: + is_best_iter = True + self.best_score = cur_score + + should_stop = self.es.update(current_epoch=info.iteration, is_best=is_best_iter) + return not should_stop + + +def get_params(output_type: int) -> Dict[str, Any]: + + return { + "iterations" : 10000, + "learning_rate" : 0.1 +} \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/extratrees.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/extratrees.py new file mode 100644 index 000000000..f237ed61c --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/extratrees.py @@ -0,0 +1,99 @@ +import logging.handlers +from typing import Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from sklearn.ensemble import ( + ExtraTreesClassifier, + ExtraTreesRegressor, +) + +from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.extratrees.utils import get_params + + +class ExtraTreesModel(BaseTraditionalLearner): + + def __init__(self, + task_type: str, + output_type: str, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None, + time_limit: Optional[int] = None, + **kwargs + ): + super(ExtraTreesModel, self).__init__(name="extra_trees", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, + output_type=output_type, + optimize_metric=optimize_metric, + dataset_properties=dataset_properties, + time_limit=time_limit, + 
params_func=get_params) + + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + self.config["warm_start"] = False + + if not self.is_classification: + self.model = ExtraTreesRegressor(**self.config, random_state=self.random_state) + else: + self.num_classes = len(np.unique(y_train)) + if self.num_classes > 2: + self.logger.info("==> Using warmstarting for multiclass") + self.final_n_estimators = self.config["n_estimators"] + self.config["n_estimators"] = 8 + self.config["warm_start"] = True + + self.model = ExtraTreesClassifier(**self.config, random_state=self.random_state) + + def _fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray) -> None: + assert self.model is not None, "No model found. Can't fit without preparing the model" + self.model.fit(X_train, y_train) + if self.config["warm_start"]: + self.model.n_estimators = self.final_n_estimators + self.model.fit(X_train, y_train) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) + The strategy to use for numerical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ + cs = ConfigurationSpace() + + return cs + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'ETLearner', + 'name': 'ExtraTreesLearner', + } diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/utils.py new file mode 100644 index 000000000..e480dfed8 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/extratrees/utils.py @@ -0,0 +1,7 @@ +from typing import Any, Dict + + +def get_params(output_type: int) -> Dict[str, Any]: + return { + "n_estimators" : 300 + } \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/knn.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/knn.py new file mode 100644 index 000000000..9e20ccbd6 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/knn.py @@ -0,0 +1,108 @@ +import logging.handlers +from typing import Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor + + +from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.knn.utils 
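The extra-trees learner above keeps the existing multiclass warm-start trick: fit a tiny forest first (n_estimators=8, warm_start=True), then raise n_estimators to the full budget and call fit again so scikit-learn only grows the missing trees instead of refitting from scratch. A self-contained sketch of the same pattern on random data (the final tree count of 300 matches the default from extratrees/utils.py; the toy data is invented):

    import numpy as np
    from sklearn.ensemble import ExtraTreesClassifier

    X = np.random.rand(200, 5)
    y = np.random.randint(0, 3, size=200)          # 3 classes -> the multiclass warm-start path

    model = ExtraTreesClassifier(n_estimators=8, warm_start=True, random_state=0)
    model.fit(X, y)                                # cheap initial fit
    model.n_estimators = 300                       # bump to the full budget
    model.fit(X, y)                                # adds the remaining trees incrementally
    print(len(model.estimators_))                  # 300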
import get_params as knn_get_params + + +class KNNModel(BaseTraditionalLearner): + + def __init__(self, + task_type: str, + output_type: str, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None, + time_limit: Optional[int] = None, + **kwargs + ): + super(KNNModel, self).__init__(name="knn", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, + output_type=output_type, + optimize_metric=optimize_metric, + dataset_properties=dataset_properties, + time_limit=time_limit, + params_func=knn_get_params) + self.categoricals: Optional[np.ndarray[bool]] = None + self.config.update(kwargs) + + def _preprocess(self, + X: np.ndarray + ) -> np.ndarray: + + super(KNNModel, self)._preprocess(X) + if self.categoricals is None: + self.categoricals = np.array([isinstance(X[0, ind], str) for ind in range(X.shape[1])]) + X = X[:, ~self.categoricals] if self.categoricals is not None else X + + return X + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + try: + # TODO: Add more granular switch, currently this affects all future KNN models even if they had `use_daal=False` + from sklearnex import patch_sklearn + patch_sklearn("knn_classifier") + patch_sklearn("knn_regressor") + # sklearnex backend for KNN seems to be 20-40x+ faster than native sklearn with no downsides. + self.logger.log(15, '\tUsing sklearnex KNN backend...') + except: + pass + if not self.is_classification: + self.model = KNeighborsRegressor(**self.config) + else: + self.num_classes = len(np.unique(y_train)) + # KNN is deterministic, no random seed needed + self.model = KNeighborsClassifier(**self.config) + + def _fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray) -> None: + assert self.model is not None, "No model found. Can't fit without preparing the model" + self.model.fit(X_train, y_train) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) 
+ The strategy to use for numerical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ + cs = ConfigurationSpace() + + return cs + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'KNNLearner', + 'name': 'K Nearest Neighbors Learner', + } diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/utils.py new file mode 100644 index 000000000..61ca3ca8f --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/knn/utils.py @@ -0,0 +1,8 @@ +from typing import Any, Dict + + +def get_params(output_type: int) -> Dict[str, Any]: + + return dict( + weights="uniform" + ) diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/learners.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/learners.py deleted file mode 100644 index 220c52dcd..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/learners.py +++ /dev/null @@ -1,361 +0,0 @@ -import logging.handlers -import tempfile -from typing import Dict, Optional, Union - -from catboost import CatBoostClassifier, CatBoostRegressor, Pool - -from lightgbm import LGBMClassifier, LGBMRegressor - -import numpy as np - -from sklearn.ensemble import ( - ExtraTreesClassifier, - ExtraTreesRegressor, - RandomForestClassifier, - RandomForestRegressor -) -from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor -from sklearn.svm import SVC, SVR - -from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ - BaseTraditionalLearner -from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.utils import ( - AutoPyTorchToCatboostMetrics -) - - -class LGBModel(BaseTraditionalLearner): - - def __init__(self, - task_type: str, - output_type: str, - optimize_metric: Optional[str] = None, - logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None - ): - super(LGBModel, self).__init__(name="lgb", - logger_port=logger_port, - random_state=random_state, - task_type=task_type, - output_type=output_type, - optimize_metric=optimize_metric) - - def _prepare_model(self, - X_train: np.ndarray, - y_train: np.ndarray - ) -> None: - early_stopping = 150 if X_train.shape[0] > 10000 else max(round(150 * 10000 / X_train.shape[0]), 10) - self.config["early_stopping_rounds"] = early_stopping - if not self.is_classification: - self.model = LGBMRegressor(**self.config, random_state=self.random_state) - else: - self.num_classes = len(np.unique(y_train)) if len(np.unique(y_train)) != 2 else 1 # this fixes a bug - self.config["num_class"] = self.num_classes - - self.model = LGBMClassifier(**self.config, random_state=self.random_state) - - def _fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray - ) -> None: - assert self.model is not None, "No model found. 
Can't fit without preparing the model" - self.model.fit(X_train, y_train, eval_set=[(X_val, y_val)]) - - def predict(self, X_test: np.ndarray, - predict_proba: bool = False, - preprocess: bool = True) -> np.ndarray: - assert self.model is not None, "No model found. Can't " \ - "predict before fitting. " \ - "Call fit before predicting" - if preprocess: - X_test = self._preprocess(X_test) - - if predict_proba: - if not self.is_classification: - raise ValueError("Can't predict probabilities for a regressor") - y_pred_proba = self.model.predict_proba(X_test) - if self.num_classes == 2: - y_pred_proba = y_pred_proba.transpose()[0:len(X_test)] - return y_pred_proba - - y_pred = self.model.predict(X_test) - return y_pred - - @staticmethod - def get_properties( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'LGBMLearner', - 'name': 'Light Gradient Boosting Machine Learner', - } - - -class CatboostModel(BaseTraditionalLearner): - - def __init__(self, - task_type: str, - output_type: str, - optimize_metric: Optional[str] = None, - logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None - ): - super(CatboostModel, self).__init__(name="catboost", - logger_port=logger_port, - random_state=random_state, - task_type=task_type, - output_type=output_type, - optimize_metric=optimize_metric) - self.config["train_dir"] = tempfile.gettempdir() - - def _prepare_model(self, - X_train: np.ndarray, - y_train: np.ndarray - ) -> None: - if not self.is_classification: - self.config['eval_metric'] = AutoPyTorchToCatboostMetrics[self.metric.name].value - # CatBoost Cannot handle a random state object, just the seed - self.model = CatBoostRegressor(**self.config, random_state=self.random_state.get_state()[1][0]) - else: - self.config['eval_metric'] = AutoPyTorchToCatboostMetrics[self.metric.name].value - # CatBoost Cannot handle a random state object, just the seed - self.model = CatBoostClassifier(**self.config, random_state=self.random_state.get_state()[1][0]) - - def _fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> None: - - assert self.model is not None, "No model found. 
Can't fit without preparing the model" - early_stopping = 150 if X_train.shape[0] > 10000 else max(round(150 * 10000 / X_train.shape[0]), 10) - categoricals = [ind for ind in range(X_train.shape[1]) if isinstance(X_train[0, ind], str)] - - X_train_pooled = Pool(data=X_train, label=y_train, cat_features=categoricals) - X_val_pooled = Pool(data=X_val, label=y_val, cat_features=categoricals) - - self.model.fit(X_train_pooled, - eval_set=X_val_pooled, - use_best_model=True, - early_stopping_rounds=early_stopping, - verbose=False) - - @staticmethod - def get_properties( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'CBLearner', - 'name': 'Categorical Boosting Learner', - } - - -class RFModel(BaseTraditionalLearner): - - def __init__(self, - task_type: str, - output_type: str, - optimize_metric: Optional[str] = None, - logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None - ): - super(RFModel, self).__init__(name="random_forest", - logger_port=logger_port, - random_state=random_state, - task_type=task_type, - output_type=output_type, - optimize_metric=optimize_metric) - - def _prepare_model(self, - X_train: np.ndarray, - y_train: np.ndarray - ) -> None: - - self.config["warm_start"] = False - # TODO: Check if we need to warmstart for regression. - # In autogluon, they warm start when usinf daal backend, see - # ('https://github.com/awslabs/autogluon/blob/master/tabular/src/autogluon/tabular/models/rf/rf_model.py#L35') - if not self.is_classification: - self.model = RandomForestRegressor(**self.config, random_state=self.random_state) - else: - self.num_classes = len(np.unique(y_train)) - if self.num_classes > 2: - self.logger.info("==> Using warmstarting for multiclass") - self.final_n_estimators = self.config["n_estimators"] - self.config["n_estimators"] = 8 - self.config["warm_start"] = True - self.model = RandomForestClassifier(**self.config, random_state=self.random_state) - - def _fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> None: - assert self.model is not None, "No model found. 
Can't fit without preparing the model" - - self.model.fit(X_train, y_train) - if self.config["warm_start"]: - self.model.n_estimators = self.final_n_estimators - self.model.fit(X_train, y_train) - - @staticmethod - def get_properties( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'RFLearner', - 'name': 'Random Forest Learner', - } - - -class ExtraTreesModel(BaseTraditionalLearner): - - def __init__(self, - task_type: str, - output_type: str, - optimize_metric: Optional[str] = None, - logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None - ): - super(ExtraTreesModel, self).__init__(name="extra_trees", - logger_port=logger_port, - random_state=random_state, - task_type=task_type, - output_type=output_type, - optimize_metric=optimize_metric) - - def _prepare_model(self, - X_train: np.ndarray, - y_train: np.ndarray - ) -> None: - self.config["warm_start"] = False - - if not self.is_classification: - self.model = ExtraTreesRegressor(**self.config, random_state=self.random_state) - else: - self.num_classes = len(np.unique(y_train)) - if self.num_classes > 2: - self.logger.info("==> Using warmstarting for multiclass") - self.final_n_estimators = self.config["n_estimators"] - self.config["n_estimators"] = 8 - self.config["warm_start"] = True - - self.model = ExtraTreesClassifier(**self.config, random_state=self.random_state) - - def _fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> None: - assert self.model is not None, "No model found. Can't fit without preparing the model" - self.model.fit(X_train, y_train) - if self.config["warm_start"]: - self.model.n_estimators = self.final_n_estimators - self.model.fit(X_train, y_train) - - @staticmethod - def get_properties( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'ETLearner', - 'name': 'ExtraTreesLearner', - } - - -class KNNModel(BaseTraditionalLearner): - - def __init__(self, - task_type: str, - output_type: str, - optimize_metric: Optional[str] = None, - logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None - ): - super(KNNModel, self).__init__(name="knn", - logger_port=logger_port, - random_state=random_state, - task_type=task_type, - output_type=output_type, - optimize_metric=optimize_metric) - self.categoricals: Optional[np.ndarray[bool]] = None - - def _preprocess(self, - X: np.ndarray - ) -> np.ndarray: - - super(KNNModel, self)._preprocess(X) - if self.categoricals is None: - self.categoricals = np.array([isinstance(X[0, ind], str) for ind in range(X.shape[1])]) - X = X[:, ~self.categoricals] if self.categoricals is not None else X - - return X - - def _prepare_model(self, - X_train: np.ndarray, - y_train: np.ndarray - ) -> None: - if not self.is_classification: - self.model = KNeighborsRegressor(**self.config) - else: - self.num_classes = len(np.unique(y_train)) - # KNN is deterministic, no random seed needed - self.model = KNeighborsClassifier(**self.config) - - def _fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> None: - assert self.model is not None, "No model found. 
Can't fit without preparing the model" - self.model.fit(X_train, y_train) - - @staticmethod - def get_properties( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'KNNLearner', - 'name': 'K Nearest Neighbors Learner', - } - - -class SVMModel(BaseTraditionalLearner): - - def __init__(self, - task_type: str, - output_type: str, - optimize_metric: Optional[str] = None, - logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, - random_state: Optional[np.random.RandomState] = None - ): - super(SVMModel, self).__init__(name="svm", - logger_port=logger_port, - random_state=random_state, - task_type=task_type, - output_type=output_type, - optimize_metric=optimize_metric) - - def _prepare_model(self, - X_train: np.ndarray, - y_train: np.ndarray - ) -> None: - if not self.is_classification: - # Does not take random state. - self.model = SVR(**self.config) - else: - self.model = SVC(**self.config, probability=True, random_state=self.random_state) - - def _fit(self, X_train: np.ndarray, - y_train: np.ndarray, - X_val: np.ndarray, - y_val: np.ndarray) -> None: - assert self.model is not None, "No model found. Can't fit without preparing the model" - self.model.fit(X_train, y_train) - - @staticmethod - def get_properties( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'SVMLearner', - 'name': 'Support Vector Machine Learner', - } diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/lgbm.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/lgbm.py new file mode 100644 index 000000000..644c9fde6 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/lgbm.py @@ -0,0 +1,153 @@ +import logging.handlers +from time import time +from typing import Dict, Optional, Union + +import logging + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformIntegerHyperparameter, + UniformFloatHyperparameter +) + +from lightgbm import LGBMClassifier, LGBMRegressor + +import numpy as np + +from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.lgbm.utils import early_stopping_custom, get_metric, get_params as lgb_get_params, get_train_loss_name +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter +from autoPyTorch.utils.early_stopping import get_early_stopping_rounds + + + +class LGBModel(BaseTraditionalLearner): + def __init__(self, + task_type: str, + output_type: str, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None, + time_limit: Optional[int] = None, + **kwargs + ): + super(LGBModel, self).__init__(name="lgb", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, 
+ output_type=output_type, + optimize_metric=optimize_metric, + dataset_properties=dataset_properties, + time_limit=time_limit, + params_func=lgb_get_params) + self.config.update(kwargs) + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + early_stopping = get_early_stopping_rounds(X_train.shape[0]) + self.config["early_stopping_rounds"] = early_stopping + self.stopping_metric_name = get_metric(output_type=self.output_type, optimize_metric=self.metric.name) + self.training_objective = get_train_loss_name(self.output_type) + if not self.is_classification: + self.model = LGBMRegressor(**self.config, random_state=self.random_state) + else: + self.num_classes = len(np.unique(y_train)) if len(np.unique(y_train)) != 2 else 1 # this fixes a bug + self.config["num_class"] = self.num_classes + + self.model = LGBMClassifier(**self.config, random_state=self.random_state) + + def _fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray + ) -> None: + assert self.model is not None, "No model found. Can't fit without preparing the model" + start_time = time() + callbacks = [ + # TODO: pass start time and time limit to early stopping + early_stopping_custom(self.config["early_stopping_rounds"], logger=self.logger, metrics_to_use=[('valid_set', self.stopping_metric_name)], max_diff=None, start_time=start_time, time_limit=self.time_limit, + ignore_dart_warning=True, verbose=False, manual_stop_file=False, train_loss_name=self.training_objective), + ] + self.model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=self.training_objective, callbacks=callbacks) + + def predict(self, X_test: np.ndarray, + predict_proba: bool = False, + preprocess: bool = True) -> np.ndarray: + assert self.model is not None, "No model found. Can't " \ + "predict before fitting. " \ + "Call fit before predicting" + if preprocess: + X_test = self._preprocess(X_test) + + if predict_proba: + if not self.is_classification: + raise ValueError("Can't predict probabilities for a regressor") + y_pred_proba = self.model.predict_proba(X_test) + if self.num_classes == 2: + y_pred_proba = y_pred_proba.transpose()[0:len(X_test)] + return y_pred_proba + + y_pred = self.model.predict(X_test) + return y_pred + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + learning_rate: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='learning_rate', + value_range=(5e-3, 0.2), + default_value=0.05, + log=True + ), + feature_fraction: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='feature_fraction', + value_range=(0.75, 1), + default_value=1, + ), + min_data_in_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='min_data_in_leaf', + value_range=(2, 60), + default_value=20, + ), + num_leaves: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='num_leaves', + value_range=(16, 96), + default_value=31, + ), + ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) 
+ The strategy to use for numerical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ + cs = ConfigurationSpace() + + add_hyperparameter(cs, num_leaves, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_data_in_leaf, UniformIntegerHyperparameter) + add_hyperparameter(cs, feature_fraction, UniformFloatHyperparameter) + add_hyperparameter(cs, learning_rate, UniformFloatHyperparameter) + + return cs + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'LGBMLearner', + 'name': 'Light Gradient Boosting Machine Learner', + } \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/utils.py new file mode 100644 index 000000000..466031433 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/lgbm/utils.py @@ -0,0 +1,298 @@ +from typing import Any, Dict +from autoPyTorch.constants import ( + MULTICLASS, + BINARY, + CONTINUOUS, + OUTPUT_TYPES_TO_STRING +) +import logging.handlers +from typing import Dict, Optional, Union + +import copy +import logging +import os +import psutil +import time +import warnings +from operator import gt, lt + +from lightgbm.callback import _format_eval_result, EarlyStopException +from autoPyTorch.utils.early_stopping import SimpleEarlyStopper + + +DEFAULT_METRIC_INDEX = 0 + + +def get_common_params(): + return { + "num_rounds": 10000, + "num_leaves": 128, + "feature_fraction": 0.9, + "boosting_type": "gbdt", + } + + +def get_params_binary(): + return { + "min_data_in_leaf": 3, + "learning_rate": 0.03 + } + +def get_params_multiclass(): + return { + "min_data_in_leaf": 3, + "learning_rate": 0.03 + } + +def get_params_continuous(): + return { + "min_data_in_leaf": 3, + "learning_rate": 0.03 + } + +def get_params(output_type: int) -> Dict[str, Any]: + + common_params = get_common_params() + if output_type == BINARY: + common_params.update(get_params_binary()) + elif output_type == MULTICLASS: + common_params.update(get_params_multiclass()) + elif output_type == CONTINUOUS: + common_params.update(get_params_continuous()) + else: + raise ValueError(f"Unknown output_type: {OUTPUT_TYPES_TO_STRING[output_type]}") + return common_params + + +def early_stopping_custom(stopping_rounds, logger, first_metric_only=False, metrics_to_use=None, start_time=None, time_limit=None, verbose=True, max_diff=None, ignore_dart_warning=False, manual_stop_file=None, train_loss_name=None, reporter=None): + """Create a callback that activates early stopping. + Note: + Implementation from autogluon + Note + ---- + Activates early stopping. + The model will train until the validation score stops improving. + Validation score needs to improve at least every ``early_stopping_rounds`` round(s) + to continue training. + Requires at least one validation data and one metric. + If there's more than one, will check all of them. But the training data is ignored anyway. + To check only the first metric set ``first_metric_only`` to True. + Parameters + ---------- + stopping_rounds : int or tuple + If int, The possible number of rounds without the trend occurrence. + If tuple, contains early stopping class as first element and class init kwargs as second element. 
+ first_metric_only : bool, optional (default=False) + Whether to use only the first metric for early stopping. + verbose : bool, optional (default=True) + Whether to print message with early stopping information. + train_loss_name : str, optional (default=None): + Name of metric that contains training loss value. + reporter : optional (default=None): + reporter object from AutoGluon scheduler. + Returns + ------- + callback : function + The callback that activates early stopping. + """ + best_score = [] + best_iter = [] + best_score_list = [] + best_trainloss = [] # stores training losses at corresponding best_iter + cmp_op = [] + enabled = [True] + indices_to_check = [] + mem_status = psutil.Process() + init_mem_rss = [] + init_mem_avail = [] + es = [] + + def _init(env): + if not ignore_dart_warning: + enabled[0] = not any((boost_alias in env.params + and env.params[boost_alias] == 'dart') for boost_alias in ('boosting', + 'boosting_type', + 'boost')) + if not enabled[0]: + warnings.warn('Early stopping is not available in dart mode') + return + if not env.evaluation_result_list: + raise ValueError('For early stopping, ' + 'at least one dataset and eval metric is required for evaluation') + + if verbose: + msg = "Training until validation scores don't improve for {} rounds." + logger.debug(msg.format(stopping_rounds)) + if manual_stop_file: + logger.debug('Manually stop training by creating file at location: ', manual_stop_file) + + if isinstance(stopping_rounds, int): + es_template = SimpleEarlyStopper(patience=stopping_rounds) + else: + es_template = stopping_rounds[0](**stopping_rounds[1]) + + for eval_ret in env.evaluation_result_list: + best_iter.append(0) + best_score_list.append(None) + best_trainloss.append(None) + es.append(copy.deepcopy(es_template)) + if eval_ret[3]: + best_score.append(float('-inf')) + cmp_op.append(gt) + else: + best_score.append(float('inf')) + cmp_op.append(lt) + + if metrics_to_use is None: + for i in range(len(env.evaluation_result_list)): + indices_to_check.append(i) + if first_metric_only: + break + else: + for i, eval in enumerate(env.evaluation_result_list): + if (eval[0], eval[1]) in metrics_to_use: + indices_to_check.append(i) + if first_metric_only: + break + + init_mem_rss.append(mem_status.memory_info().rss) + init_mem_avail.append(psutil.virtual_memory().available) + + def _callback(env): + if not cmp_op: + _init(env) + if not enabled[0]: + return + train_loss_val = 0.0 + for i in indices_to_check: + is_best_iter = False + eval_result = env.evaluation_result_list[i] + _, eval_metric, score, greater_is_better = eval_result + if best_score_list[i] is None or cmp_op[i](score, best_score[i]): + is_best_iter = True + best_score[i] = score + best_iter[i] = env.iteration + best_score_list[i] = env.evaluation_result_list + best_trainloss[i] = train_loss_val + if reporter is not None: # Report current best scores for iteration, used in HPO + if i == indices_to_check[0]: # TODO: documentation needs to note that we assume 0th index is the 'official' validation performance metric. 
+ if cmp_op[i] == gt: + validation_perf = score + else: + validation_perf = -score + reporter(epoch=env.iteration + 1, + validation_performance=validation_perf, + train_loss=best_trainloss[i], + best_iter_sofar=best_iter[i] + 1, + best_valperf_sofar=best_score[i], + eval_metric=eval_metric, # eval_metric here is the stopping_metric from LGBModel + greater_is_better=greater_is_better, + ) + early_stop = es[i].update(cur_round=env.iteration, is_best=is_best_iter) + if early_stop: + if verbose: + logger.log(15, 'Early stopping, best iteration is:\n[%d]\t%s' % ( + best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]]))) + raise EarlyStopException(best_iter[i], best_score_list[i]) + elif (max_diff is not None) and (abs(score - best_score[i]) > max_diff): + if verbose: + logger.debug('max_diff breached!') + logger.debug(abs(score - best_score[i])) + logger.log(15, 'Early stopping, best iteration is:\n[%d]\t%s' % ( + best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]]))) + raise EarlyStopException(best_iter[i], best_score_list[i]) + if env.iteration == env.end_iteration - 1: + if verbose: + logger.log(15, 'Did not meet early stopping criterion. Best iteration is:\n[%d]\t%s' % ( + best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]]))) + raise EarlyStopException(best_iter[i], best_score_list[i]) + if verbose: + logger.debug((env.iteration - best_iter[i], eval_result)) + if manual_stop_file: + if os.path.exists(manual_stop_file): + i = indices_to_check[0] + logger.log(20, 'Found manual stop file, early stopping. Best iteration is:\n[%d]\t%s' % ( + best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]]))) + raise EarlyStopException(best_iter[i], best_score_list[i]) + if time_limit: + time_elapsed = time.time() - start_time + time_left = time_limit - time_elapsed + if time_left <= 0: + i = indices_to_check[0] + logger.log(20, '\tRan out of time, early stopping on iteration ' + str(env.iteration+1) + '. 
Best iteration is:\n\t[%d]\t%s' % ( + best_iter[i] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[i]]))) + raise EarlyStopException(best_iter[i], best_score_list[i]) + + # TODO: Add toggle parameter to early_stopping to disable this + # TODO: Identify optimal threshold values for early_stopping based on lack of memory + if env.iteration % 10 == 0: + available = psutil.virtual_memory().available + cur_rss = mem_status.memory_info().rss + + if cur_rss < init_mem_rss[0]: + init_mem_rss[0] = cur_rss + estimated_model_size_mb = (cur_rss - init_mem_rss[0]) >> 20 + available_mb = available >> 20 + + model_size_memory_ratio = estimated_model_size_mb / available_mb + if verbose or (model_size_memory_ratio > 0.25): + logging.debug('Available Memory: '+str(available_mb)+' MB') + logging.debug('Estimated Model Size: '+str(estimated_model_size_mb)+' MB') + + early_stop = False + if model_size_memory_ratio > 1.0: + logger.warning('Warning: Large GBM model size may cause OOM error if training continues') + logger.warning('Available Memory: '+str(available_mb)+' MB') + logger.warning('Estimated GBM model size: '+str(estimated_model_size_mb)+' MB') + early_stop = True + + # TODO: We will want to track size of model as well, even if we early stop before OOM, we will still crash when saving if the model is large enough + if available_mb < 512: # Less than 500 MB + logger.warning('Warning: Low available memory may cause OOM error if training continues') + logger.warning('Available Memory: '+str(available_mb)+' MB') + logger.warning('Estimated GBM model size: '+str(estimated_model_size_mb)+' MB') + early_stop = True + + if early_stop: + logger.warning('Warning: Early stopped GBM model prior to optimal result to avoid OOM error. Please increase available memory to avoid subpar model quality.') + logger.log(15, 'Early stopping, best iteration is:\n[%d]\t%s' % ( + best_iter[0] + 1, '\t'.join([_format_eval_result(x) for x in best_score_list[0]]))) + raise EarlyStopException(best_iter[0], best_score_list[0]) + + _callback.order = 30 + return _callback + +def get_compatible_metric_dict(output_type: int) -> Dict[str, str]: + if output_type == BINARY: + return dict( + accuracy='binary_error', + log_loss='binary_logloss', + roc_auc='auc', + ) + elif output_type == MULTICLASS: + return dict( + accuracy='multi_error', + log_loss='multi_logloss', + ) + elif output_type == CONTINUOUS: + return dict( + mean_absolute_error='l1', + mean_squared_error='l2', + root_mean_squared_error='rmse', + ) + + +def get_metric(output_type: int, optimize_metric: str) -> str: + metric_dict = get_compatible_metric_dict(output_type=output_type) + return metric_dict.get(optimize_metric, list(metric_dict.values())[DEFAULT_METRIC_INDEX]) + +def get_train_loss_name(output_type: int): + if output_type == BINARY: + train_loss_name = 'binary_logloss' + elif output_type == MULTICLASS: + train_loss_name = 'multi_logloss' + elif output_type == CONTINUOUS: + train_loss_name = 'l2' + else: + raise ValueError(f"unknown output_type for LGBModel: {output_type}") + return train_loss_name diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/random_forest.py 
b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/random_forest.py new file mode 100644 index 000000000..f368331ef --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/random_forest.py @@ -0,0 +1,103 @@ +import logging.handlers +import tempfile +from typing import Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from sklearn.ensemble import ( + RandomForestClassifier, + RandomForestRegressor +) + +from autoPyTorch.constants import MULTICLASS +from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.random_forest.utils import get_params + + +class RFModel(BaseTraditionalLearner): + def __init__(self, + task_type: str, + output_type: str, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None, + time_limit: Optional[int] = None, + **kwargs + ): + super(RFModel, self).__init__(name="random_forest", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, + output_type=output_type, + optimize_metric=optimize_metric, + dataset_properties=dataset_properties, + time_limit=time_limit, + params_func=get_params) + self.config.update(kwargs) + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + + self.config["warm_start"] = False + # TODO: Check if we need to warmstart for regression. + # In autogluon, they warm start when usinf daal backend, see + # ('https://github.com/awslabs/autogluon/blob/master/tabular/src/autogluon/tabular/models/rf/rf_model.py#L35') + if not self.is_classification: + self.model = RandomForestRegressor(**self.config, random_state=self.random_state) + else: + self.num_classes = len(np.unique(y_train)) + if self.num_classes > 2: + self.logger.info("==> Using warmstarting for multiclass") + self.final_n_estimators = self.config["n_estimators"] + self.config["n_estimators"] = 8 + self.config["warm_start"] = True + self.model = RandomForestClassifier(**self.config, random_state=self.random_state) + + def _fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray) -> None: + assert self.model is not None, "No model found. Can't fit without preparing the model" + + self.model = self.model.fit(X_train, y_train) + if self.config["warm_start"]: + self.model.n_estimators = self.final_n_estimators + self.model.fit(X_train, y_train) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) 
+ The strategy to use for numerical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ + cs = ConfigurationSpace() + + return cs + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'RFLearner', + 'name': 'Random Forest Learner', + } diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/utils.py new file mode 100644 index 000000000..320ca413e --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/random_forest/utils.py @@ -0,0 +1,9 @@ +from typing import Any, Dict + + +def get_params(output_type: int) -> Dict[str, Any]: + + return { + "n_estimators" : 300, + 'bootstrap': True + } diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/utils.py deleted file mode 100644 index b45161aa9..000000000 --- a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/utils.py +++ /dev/null @@ -1,15 +0,0 @@ -from enum import Enum - - -class AutoPyTorchToCatboostMetrics(Enum): - mean_absolute_error = "MAE" - root_mean_squared_error = "RMSE" - mean_squared_log_error = "MSLE" - r2 = "R2" - accuracy = "Accuracy" - balanced_accuracy = "BalancedAccuracy" - f1 = "F1" - roc_auc = "AUC" - precision = "Precision" - recall = "Recall" - log_loss = "Logloss" diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/__init__.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/early_stopping_custom.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/early_stopping_custom.py new file mode 100644 index 000000000..31dd386b1 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/early_stopping_custom.py @@ -0,0 +1,90 @@ +import time +import psutil +import logging + +from xgboost.callback import EarlyStopping + +from autoPyTorch.utils.early_stopping import SimpleEarlyStopper + +logger = logging.getLogger(__name__) + + +class EarlyStoppingCustom(EarlyStopping): + """ + Augments early stopping in XGBoost to also consider time_limit, memory usage, and usage of adaptive early stopping methods. + + Parameters + ---------- + rounds : int or tuple + If int, The possible number of rounds without the trend occurrence. + If tuple, contains early stopping class as first element and class init kwargs as second element. 
+ """ + def __init__(self, rounds, time_limit=None, start_time=None, verbose=False, **kwargs): + if rounds is None: + # Disable early stopping via rounds + rounds = 999999 + super().__init__(rounds=999999, **kwargs) + if isinstance(rounds, int): + self.es = SimpleEarlyStopper(patience=rounds) + else: + self.es = rounds[0](**rounds[1]) + self.time_limit = time_limit + self.start_time = start_time + self.verbose = verbose + self._mem_status = None + self._mem_init_rss = None + + def before_training(self, model): + model = super().before_training(model=model) + if self.start_time is None: + self.start_time = time.time() + self._mem_status = psutil.Process() + self._mem_init_rss = self._mem_status.memory_info().rss + return model + + def after_iteration(self, model, epoch, evals_log): + should_stop = super().after_iteration(model, epoch, evals_log) + if should_stop: + return should_stop + is_best_iter = self.current_rounds == 0 + should_stop = self.es.update(current_epoch=epoch, is_best=is_best_iter) + if should_stop: + return should_stop + if self._time_check(model=model, epoch=epoch): + return True + if epoch % 10 == 0 and self._memory_check(model=model): + return True + return should_stop + + def _time_check(self, model, epoch): + if self.time_limit is not None: + time_elapsed = time.time() - self.start_time + time_left = self.time_limit - time_elapsed + if time_left <= 0: + if self.verbose: + logger.log(20, f"Ran out of time, early stopping on iteration {epoch}. Best iteration is: \t[{model.attr('best_iteration')}]\t{model.attr('best_score')}") + return True + return False + + def _memory_check(self, model): + available = psutil.virtual_memory().available + cur_rss = self._mem_status.memory_info().rss + if cur_rss < self._mem_init_rss: + self._mem_init_rss = cur_rss + estimated_model_size_mb = (cur_rss - self._mem_init_rss) >> 20 + available_mb = available >> 20 + + model_size_memory_ratio = estimated_model_size_mb / available_mb + + if (model_size_memory_ratio > 1.0) or (available_mb < 512): + logger.warning('Warning: Large XGB model size may cause OOM error if training continues') + logger.warning(f'Available Memory: {available_mb} MB') + logger.warning(f'Estimated XGB model size: {estimated_model_size_mb} MB') + if self.verbose: + logger.warning(f'Warning: Early stopped XGB model prior to optimal result to avoid OOM error. Please increase available memory to avoid subpar model quality.\n') + logger.warning(f"Early stopping. 
Best iteration is: \t[{model.attr('best_iteration')}]\t{model.attr('best_score')}") + return True + elif self.verbose and (model_size_memory_ratio > 0.25): + logger.log(15, f'Available Memory: {available_mb} MB') + logger.log(15, f'Estimated XGB model size: {estimated_model_size_mb} MB') + return False diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/utils.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/utils.py new file mode 100644 index 000000000..4124c171b --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/utils.py @@ -0,0 +1,85 @@ +from typing import Dict +from autoPyTorch.constants import BINARY, MULTICLASS, CONTINUOUS +from enum import Enum + + +DEFAULT_METRIC_INDEX = 0 + + +def get_compatible_metric_dict(output_type: int) -> Dict[str, str]: + if output_type == BINARY: + return dict( + accuracy='error', + log_loss='logloss', + roc_auc='auc', + ) + elif output_type == MULTICLASS: + return dict( + accuracy='merror', + log_loss='mlogloss', + ) + elif output_type == CONTINUOUS: + return dict( + mean_absolute_error='mae', + root_mean_squared_error='rmse', + ) + +def get_metric(output_type: int, optimize_metric: str) -> str: + metric_dict = get_compatible_metric_dict(output_type=output_type) + return metric_dict.get(optimize_metric, list(metric_dict.values())[DEFAULT_METRIC_INDEX]) + + +DEFAULT_NUM_BOOST_ROUND = 10000 +# Options: [10, 100, 200, 300, 400, 500, 1000, 10000] + + +def get_param_baseline(output_type): + if output_type == BINARY: + return get_param_binary_baseline() + elif output_type == MULTICLASS: + return get_param_multiclass_baseline() + elif output_type == CONTINUOUS: + return get_param_regression_baseline() + else: + return get_param_binary_baseline() + + +def get_base_params(): + base_params = { + 'n_estimators': DEFAULT_NUM_BOOST_ROUND, + 'learning_rate': 0.1, + 'n_jobs': -1, + } + return base_params + + +def get_param_binary_baseline(): + params = get_base_params() + baseline_params = { + 'objective': 'binary:logistic', + 'booster': 'gbtree', + 'use_label_encoder': False, + } + params.update(baseline_params) + return params + + +def get_param_multiclass_baseline(): + params = get_base_params() + baseline_params = { + 'objective': 'multi:softmax', + 'booster': 'gbtree', + 'use_label_encoder': False, + } + params.update(baseline_params) + return params + + +def get_param_regression_baseline(): + params = get_base_params() + baseline_params = { + 'objective': 'reg:squarederror', + 'booster': 'gbtree', + } + params.update(baseline_params) + return params diff --git a/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/xgboost.py b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/xgboost.py new file mode 100644 index 000000000..f95427085 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/traditional_ml/traditional_learner/xgboost/xgboost.py @@ -0,0 +1,198 @@ +import logging.handlers +from time import time +from typing import Dict, Optional, Union + +import logging + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformIntegerHyperparameter, + UniformFloatHyperparameter +) + +import numpy as np + +from autoPyTorch.pipeline.base_pipeline import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.base_traditional_learner import \ + BaseTraditionalLearner +from 
autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.xgboost.utils import get_metric, get_param_baseline as xgb_get_params +from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner.xgboost.early_stopping_custom import EarlyStoppingCustom +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter +from autoPyTorch.utils.early_stopping import get_early_stopping_rounds + + +class XGBModel(BaseTraditionalLearner): + def __init__(self, + task_type: str, + output_type: str, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + optimize_metric: Optional[str] = None, + logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + random_state: Optional[np.random.RandomState] = None, + time_limit: Optional[int] = None, + **kwargs + ): + super(XGBModel, self).__init__(name="xgboost", + logger_port=logger_port, + random_state=random_state, + task_type=task_type, + output_type=output_type, + optimize_metric=optimize_metric, + dataset_properties=dataset_properties, + time_limit=time_limit, + params_func=xgb_get_params) + self.config.update(kwargs) + self.encoder = None + + def _prepare_model(self, + X_train: np.ndarray, + y_train: np.ndarray + ) -> None: + from xgboost import XGBClassifier, XGBRegressor + self.eval_metric = get_metric(self.output_type, optimize_metric=self.metric.name) + # avoid unnecessary warnings + self.config['eval_metric'] = get_metric(self.output_type, optimize_metric=self.metric.name) + if not self.is_classification: + self.model = XGBRegressor(**self.config, random_state=self.random_state) + else: + self.config["num_class"] = len(np.unique(y_train)) if len(np.unique(y_train)) != 2 else 1 # this fixes a bug + + self.model = XGBClassifier(**self.config, random_state=self.random_state) + + def _fit(self, X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray + ) -> None: + start_time = time() + + assert self.model is not None, "No model found. Can't fit without preparing the model" + eval_set = [] + if X_val is None: + early_stopping_rounds = None + eval_set = None + else: + eval_set.append((X_val, y_val)) + early_stopping_rounds = get_early_stopping_rounds(X_train.shape[0]) + + callbacks = [] + if eval_set is not None: + callbacks.append(EarlyStoppingCustom(early_stopping_rounds, start_time=start_time, time_limit=self.time_limit)) + self.model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=self.eval_metric, callbacks=callbacks, verbose=False) + + def _preprocess(self, + X: np.ndarray + ) -> np.ndarray: + from sklearn.compose import make_column_transformer + from sklearn.preprocessing import OneHotEncoder + + super(XGBModel, self)._preprocess(X) + + if len(self.dataset_properties['categorical_columns']) > 0: + if self.encoder is None: + self.encoder = make_column_transformer((OneHotEncoder(sparse=False, handle_unknown='ignore'), self.dataset_properties['categorical_columns']), remainder="passthrough") + self.encoder.fit(X) + X = self.encoder.transform(X) + + return X + + def predict(self, X_test: np.ndarray, + predict_proba: bool = False, + preprocess: bool = True) -> np.ndarray: + assert self.model is not None, "No model found. Can't " \ + "predict before fitting. 
" \ + "Call fit before predicting" + if preprocess: + X_test = self._preprocess(X_test) + + if predict_proba: + if not self.is_classification: + raise ValueError("Can't predict probabilities for a regressor") + y_pred_proba = self.model.predict_proba(X_test) + if self.num_classes == 2: + y_pred_proba = y_pred_proba.transpose()[0:len(X_test)] + return y_pred_proba + + y_pred = self.model.predict(X_test) + return y_pred + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + learning_rate: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='learning_rate', + value_range=(5e-3, 0.2), + default_value=0.1, + log=True + ), + max_depth: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='max_depth', + value_range=(3, 10), + default_value=6, + ), + min_child_weight: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='min_child_weight', + value_range=(1, 5), + default_value=1, + ), + gamma: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='gamma', + value_range=(0, 5), + default_value=0.01, + ), + subsample: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='subsample', + value_range=(0.5, 1), + default_value=1, + ), + colsample_bytree: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='colsample_bytree', + value_range=(0.5, 1), + default_value=1, + ), + reg_alpha: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='reg_alpha', + value_range=(0, 10), + default_value=0, + ), + reg_lambda: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='reg_lambda', + value_range=(0, 10), + default_value=0, + ), + ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) 
+ The strategy to use for numerical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ + cs = ConfigurationSpace() + + add_hyperparameter(cs, colsample_bytree, UniformFloatHyperparameter) + add_hyperparameter(cs, subsample, UniformFloatHyperparameter) + add_hyperparameter(cs, reg_alpha, UniformFloatHyperparameter) + add_hyperparameter(cs, gamma, UniformFloatHyperparameter) + add_hyperparameter(cs, min_child_weight, UniformIntegerHyperparameter) + add_hyperparameter(cs, learning_rate, UniformFloatHyperparameter) + add_hyperparameter(cs, reg_lambda, UniformFloatHyperparameter) + add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter) + + return cs + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'XGBLearner', + 'name': 'Xtreme Gradient Boosting Machine Learner', + } \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index 0cea0b2c7..38b508ae4 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -31,10 +31,10 @@ class BaseDataLoaderComponent(autoPyTorchTrainingComponent): """ - def __init__(self, batch_size: int = 64, + def __init__(self, max_batch_size: int = 64, random_state: Optional[np.random.RandomState] = None) -> None: super().__init__(random_state=random_state) - self.batch_size = batch_size + self.max_batch_size = max_batch_size self.train_data_loader: Optional[torch.utils.data.DataLoader] = None self.val_data_loader: Optional[torch.utils.data.DataLoader] = None self.test_data_loader: Optional[torch.utils.data.DataLoader] = None @@ -108,6 +108,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: train_dataset = datamanager.get_dataset(split_id=X['split_id'], train=True) + self.batch_size = min(int(2 ** (3 + np.floor(np.log10(len(train_dataset))))), self.max_batch_size) + self.train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=min(self.batch_size, len(train_dataset)), @@ -258,13 +260,13 @@ def get_torchvision_datasets(self) -> Dict[str, torchvision.datasets.VisionDatas @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - batch_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="batch_size", + max_batch_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="max_batch_size", value_range=(32, 320), default_value=64, log=True) ) -> ConfigurationSpace: cs = ConfigurationSpace() - add_hyperparameter(cs, batch_size, UniformIntegerHyperparameter) + add_hyperparameter(cs, max_batch_size, UniformIntegerHyperparameter) return cs diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index b380659da..e7f103f24 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -35,8 +35,10 @@ ) from autoPyTorch.pipeline.components.training.trainer.utils import Lookahead, update_model_state_dict_from_swa from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, 
get_device_from_fit_dictionary +from autoPyTorch.utils.early_stopping import AbstractEarlyStopper, SimpleEarlyStopper from autoPyTorch.utils.logging_ import get_named_client_logger + trainer_directory = os.path.split(__file__)[0] _trainers = find_components(__package__, trainer_directory, @@ -68,6 +70,7 @@ def __init__(self, self.run_summary: Optional[RunSummary] = None self.writer: Optional[SummaryWriter] = None self.early_stopping_split_type: Optional[str] = None + self.early_stopper: Optional[AbstractEarlyStopper] = None self._fit_requirements: Optional[List[FitRequirement]] = [ FitRequirement("lr_scheduler", (_LRScheduler,), user_defined=False, dataset_property=False), FitRequirement("num_run", (int,), user_defined=False, dataset_property=False), @@ -338,6 +341,9 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic additional_losses = X['additional_losses'] if 'additional_losses' in X else None labels = self._get_train_label(X) + # Allow to disable early stopping + if X['early_stopping'] is not None or X['early_stopping'] >= 0: + self.early_stopper = SimpleEarlyStopper(patience=X['early_stopping']) self.choice.prepare( model=X['network'], @@ -481,7 +487,7 @@ def _get_train_label(self, X: Dict[str, Any]) -> List[int]: Verifies and validates the labels from train split. """ # Ensure that the split is not missing any class. - labels: List[int] = X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]] + labels: List[int] = X['y_train'][X['backend'].load_datamanager().splits[X['repeat_id']][X['split_id']][0]] if STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']] in CLASSIFICATION_TASKS: unique_labels = len(np.unique(labels)) if unique_labels < X['dataset_properties']['output_shape']: @@ -527,7 +533,7 @@ def early_stop_handler(self, X: Dict[str, Any]) -> bool: assert self.early_stopping_split_type is not None # mypy # Allow to disable early stopping - if X['early_stopping'] is None or X['early_stopping'] < 0: + if self.early_stopper is None: return False # Store the best weights seen so far: @@ -536,14 +542,14 @@ def early_stop_handler(self, X: Dict[str, Any]) -> bool: last_epoch = self.run_summary.get_last_epoch() best_epoch = self.run_summary.get_best_epoch(split_type=self.early_stopping_split_type) - epochs_since_best = last_epoch - best_epoch + is_best = last_epoch == best_epoch # Save the checkpoint if there is a new best epoch best_path = os.path.join(self.checkpoint_dir, 'best.pth') - if epochs_since_best == 0: + if is_best: torch.save(X['network'].state_dict(), best_path) - return epochs_since_best > cast(int, X['early_stopping']) + return self.early_stopper.update(last_epoch, is_best) def eval_valid_each_epoch(self, X: Dict[str, Any]) -> bool: """ diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index 2e64a6944..376996ed3 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -18,6 +18,9 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import ( TabularColumnTransformer ) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.column_splitting.ColumnSplitter import ( + ColumnSplitter +) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import ( CoalescerChoice ) @@ -29,6 +32,7 @@ ) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer from 
autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.skew_transformer import SkewTransformerChoice from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \ VarianceThreshold import VarianceThreshold from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing @@ -288,9 +292,11 @@ def _get_pipeline_steps( steps.extend([ ("imputer", SimpleImputer(random_state=self.random_state)), ("variance_threshold", VarianceThreshold(random_state=self.random_state)), - ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)), + # ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)), + ("column_splitter", ColumnSplitter(random_state=self.random_state)), ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)), ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), + ("skew_transformer", SkewTransformerChoice(default_dataset_properties, random_state=self.random_state)), ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties, random_state=self.random_state)), ("tabular_transformer", TabularColumnTransformer(random_state=self.random_state)), diff --git a/autoPyTorch/pipeline/traditional_tabular_classification.py b/autoPyTorch/pipeline/traditional_tabular_classification.py index 8cdfeaf39..675ad9b43 100644 --- a/autoPyTorch/pipeline/traditional_tabular_classification.py +++ b/autoPyTorch/pipeline/traditional_tabular_classification.py @@ -11,9 +11,8 @@ from autoPyTorch.pipeline.base_pipeline import BasePipeline, PipelineStepType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent -from autoPyTorch.pipeline.components.setup.traditional_ml import ModelChoice from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - +from autoPyTorch.pipeline.components.setup.traditional_ml.tabular_traditional_model import TabularTraditionalModel class TraditionalTabularClassificationPipeline(ClassifierMixin, BasePipeline): """ @@ -229,7 +228,8 @@ def _get_pipeline_steps( default_dataset_properties.update(dataset_properties) steps.extend([ - ("model_trainer", ModelChoice(default_dataset_properties, + ("model_trainer", TabularTraditionalModel( + # ModelChoice(default_dataset_properties, random_state=self.random_state)), ]) return steps @@ -257,14 +257,14 @@ def get_pipeline_representation(self) -> Dict[str, str]: Contains the pipeline representation in a short format """ estimator_name = 'TraditionalTabularClassification' - if self.steps[0][1].choice is not None: - if self.steps[0][1].choice.model is None: - estimator_name = self.steps[0][1].choice.__class__.__name__ - else: - estimator_name = cast( - str, - self.steps[0][1].choice.model.get_properties()['shortname'] - ) + # if self.steps[0][1].choice is not None: + if self.steps[0][1].model is None: + estimator_name = self.steps[0][1].model.__class__.__name__ + else: + estimator_name = cast( + str, + self.steps[0][1].model.get_properties()['shortname'] + ) return { 'Preprocessing': 'None', 'Estimator': estimator_name, diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py index d37a0c182..ec8a03b06 100644 --- a/autoPyTorch/utils/common.py +++ 
b/autoPyTorch/utils/common.py @@ -1,7 +1,9 @@ +import copy from enum import Enum +from math import floor from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Sequence, Type, Union -from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.configuration_space import ConfigurationSpace, Configuration from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, Constant, @@ -22,6 +24,8 @@ HyperparameterValueType = Union[int, str, float] +ENSEMBLE_ITERATION_MULTIPLIER = 1e8 + def ispandas(X: Any) -> bool: """ Whether X is pandas.DataFrame or pandas.Series """ return hasattr(X, "iloc") @@ -283,3 +287,41 @@ def check_none(p: Any) -> bool: if p in ("None", "none", None): return True return False + + +def validate_config(config, search_space: ConfigurationSpace, n_numerical_in_incumbent_on_task_id, num_numerical, assert_autogluon_numerical_hyperparameters: bool=False): + modified_config = config.get_dictionary().copy() if isinstance(config, Configuration) else config.copy() + + if num_numerical > 0: + imputer_numerical_hyperparameter = "imputer:numerical_strategy" + if imputer_numerical_hyperparameter not in modified_config: + modified_config[imputer_numerical_hyperparameter] = search_space.get_hyperparameter(imputer_numerical_hyperparameter).default_value if not assert_autogluon_numerical_hyperparameters else 'median' + if assert_autogluon_numerical_hyperparameters: + quantile_hp_name = 'QuantileTransformer' + skew_transformer_choice = modified_config.get('skew_transformer:__choice__', None) + if skew_transformer_choice is not None: + if skew_transformer_choice != quantile_hp_name: + to_remove_hps = [hyp.name for hyp in search_space.get_children_of('skew_transformer:__choice__') if skew_transformer_choice in hyp.name] + [modified_config.pop(remove_hp, None) for remove_hp in to_remove_hps] + + to_add_hps = [hyp for hyp in search_space.get_children_of('skew_transformer:__choice__') if quantile_hp_name in hyp.name] + modified_config['skew_transformer:__choice__'] = quantile_hp_name + for add_hp in to_add_hps: + modified_config[add_hp.name] = add_hp.default_value + + feature_preprocessing_choice = modified_config['feature_preprocessor:__choice__'] + + to_adjust_hyperparams = ['n_clusters', 'n_components', 'target_dim'] + children_hyperparameters = [hyp for hyp in search_space.get_children_of('feature_preprocessor:__choice__') if feature_preprocessing_choice in hyp.name] + for hyp in children_hyperparameters: + children = search_space.get_children_of(hyp) + if len(children) > 0: + children_hyperparameters.extend(children) + children_hyperparameters = [hyp for hyp in children_hyperparameters if hyp.name in modified_config and any([ta_hyp in hyp.name for ta_hyp in to_adjust_hyperparams])] + + for child_hyperparam in children_hyperparameters: + modified_config[child_hyperparam.name] = floor(modified_config[child_hyperparam.name]/n_numerical_in_incumbent_on_task_id * num_numerical) + + return Configuration(search_space, modified_config) + + diff --git a/autoPyTorch/utils/data_classes.py b/autoPyTorch/utils/data_classes.py new file mode 100644 index 000000000..4d031253a --- /dev/null +++ b/autoPyTorch/utils/data_classes.py @@ -0,0 +1,27 @@ +import numpy as np + +from autoPyTorch.datasets.base_dataset import BaseDataset +from autoPyTorch.constants import ( + STRING_TO_TASK_TYPES, + TABULAR_TASKS, + IMAGE_TASKS +) +from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.datasets.image_dataset import ImageDataset + +from 
autoPyTorch.data.base_validator import BaseInputValidator +from autoPyTorch.data.tabular_validator import TabularInputValidator + + +def get_dataset_class(task_type: str) -> BaseDataset: + if STRING_TO_TASK_TYPES[task_type] in TABULAR_TASKS: + return TabularDataset + elif STRING_TO_TASK_TYPES[task_type] in IMAGE_TASKS: + return ImageDataset + + +def get_data_validator_class(task_type: str) -> BaseInputValidator: + if STRING_TO_TASK_TYPES[task_type] in TABULAR_TASKS: + return TabularInputValidator + elif STRING_TO_TASK_TYPES[task_type] in IMAGE_TASKS: + return None diff --git a/autoPyTorch/utils/early_stopping.py b/autoPyTorch/utils/early_stopping.py new file mode 100644 index 000000000..3bb3d9c32 --- /dev/null +++ b/autoPyTorch/utils/early_stopping.py @@ -0,0 +1,47 @@ +""" +Implementation from autogluon early_stopping +""" +from abc import ABC + + +class AbstractEarlyStopper(ABC): + """ + Abstract class for early stopping + """ + def update(self, current_epoch, is_best=False) -> bool: + raise NotImplementedError + + def early_stop(self, current_epoch, is_best=False) -> bool: + raise NotImplementedError + + +class SimpleEarlyStopper(AbstractEarlyStopper): + """ + Implements early stopping with fixed patience + Args: + patience : int, default 10 + If no improvement occurs in `patience` epochs or greater, self.early_stop will return True. + """ + def __init__(self, patience=10): + self.patience = patience + self.best_epoch = 0 + + def update(self, current_epoch, is_best=False): + if is_best: + self.best_epoch = current_epoch + return self.early_stop(current_epoch, is_best=is_best) + + def early_stop(self, current_epoch, is_best=False): + if is_best: + return False + return current_epoch - self.best_epoch >= self.patience + + +def get_early_stopping_rounds(num_rows_train, min_patience=20, max_patience=300, min_rows=10000): + + modifier = 1 if num_rows_train <= min_rows else min_rows / num_rows_train + simple_early_stopping_rounds = max( + round(modifier * max_patience), + min_patience, + ) + return simple_early_stopping_rounds diff --git a/autoPyTorch/utils/parallel_model_runner.py b/autoPyTorch/utils/parallel_model_runner.py new file mode 100644 index 000000000..083296038 --- /dev/null +++ b/autoPyTorch/utils/parallel_model_runner.py @@ -0,0 +1,167 @@ +import time +import math +from typing import Any, Dict, List, Tuple, Union +import unittest + +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace + +import dask.distributed + +from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue +from smac.stats.stats import Stats +from smac.tae import StatusType + +from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.utils.common import dict_repr + + +def run_models_on_dataset( + time_left: int, + func_eval_time_limit_secs: int, + model_configs: List[Tuple[Union[str, Configuration]]], + logger, + logger_port, + metric, + dask_client: dask.distributed.Client, + backend: Backend, + memory_limit: int, + disable_file_output, + all_supported_metrics: bool, + ensemble_method, + include, + exclude, + search_space_updates, + pipeline_options, + seed: int, + multiprocessing_context, + n_jobs: int, + current_search_space: ConfigurationSpace, + smac_initial_run: int +) -> RunHistory: + starttime = time.time() + run_history = RunHistory() + memory_limit = memory_limit + if memory_limit is not None: + memory_limit = int(math.ceil(memory_limit)) + 
diff --git a/autoPyTorch/utils/early_stopping.py b/autoPyTorch/utils/early_stopping.py
new file mode 100644
index 000000000..3bb3d9c32
--- /dev/null
+++ b/autoPyTorch/utils/early_stopping.py
@@ -0,0 +1,47 @@
+"""
+Early-stopping utilities adapted from AutoGluon's early_stopping implementation.
+"""
+from abc import ABC
+
+
+class AbstractEarlyStopper(ABC):
+    """
+    Abstract class for early stopping.
+    """
+    def update(self, current_epoch: int, is_best: bool = False) -> bool:
+        raise NotImplementedError
+
+    def early_stop(self, current_epoch: int, is_best: bool = False) -> bool:
+        raise NotImplementedError
+
+
+class SimpleEarlyStopper(AbstractEarlyStopper):
+    """
+    Implements early stopping with a fixed patience.
+
+    Args:
+        patience : int, default 10
+            If no improvement occurs for `patience` or more epochs, self.early_stop will return True.
+    """
+    def __init__(self, patience: int = 10):
+        self.patience = patience
+        self.best_epoch = 0
+
+    def update(self, current_epoch: int, is_best: bool = False) -> bool:
+        # Record the epoch of the last improvement, then report whether to stop.
+        if is_best:
+            self.best_epoch = current_epoch
+        return self.early_stop(current_epoch, is_best=is_best)
+
+    def early_stop(self, current_epoch: int, is_best: bool = False) -> bool:
+        if is_best:
+            return False
+        return current_epoch - self.best_epoch >= self.patience
+
+
+def get_early_stopping_rounds(num_rows_train: int, min_patience: int = 20, max_patience: int = 300, min_rows: int = 10000) -> int:
+    """Shrink the patience proportionally for datasets larger than `min_rows`; the result lies in [min_patience, max_patience]."""
+    modifier = 1 if num_rows_train <= min_rows else min_rows / num_rows_train
+    simple_early_stopping_rounds = max(
+        round(modifier * max_patience),
+        min_patience,
+    )
+    return simple_early_stopping_rounds
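SimpleEarlyStopper only tracks the epoch of the last improvement, so the caller is expected to invoke update() once per epoch with an is_best flag. A minimal usage sketch with made-up validation scores (not part of the patch):

    from autoPyTorch.utils.early_stopping import SimpleEarlyStopper

    stopper = SimpleEarlyStopper(patience=3)
    best_score = float('-inf')
    for epoch, score in enumerate([0.70, 0.72, 0.71, 0.71, 0.71, 0.71]):
        is_best = score > best_score
        best_score = max(best_score, score)
        if stopper.update(epoch, is_best=is_best):
            # No improvement since epoch 1, so patience runs out at epoch 4.
            print(f"stopping at epoch {epoch}")
            break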
diff --git a/autoPyTorch/utils/parallel_model_runner.py b/autoPyTorch/utils/parallel_model_runner.py
new file mode 100644
index 000000000..083296038
--- /dev/null
+++ b/autoPyTorch/utils/parallel_model_runner.py
@@ -0,0 +1,167 @@
+import math
+import time
+import unittest.mock
+from typing import List, Optional, Tuple, Union
+
+from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
+
+import dask.distributed
+
+from smac.runhistory.runhistory import RunHistory
+from smac.stats.stats import Stats
+from smac.tae import StatusType
+
+from autoPyTorch.automl_common.common.utils.backend import Backend
+from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash
+from autoPyTorch.utils.common import dict_repr
+
+
+def run_models_on_dataset(
+    time_left: int,
+    func_eval_time_limit_secs: int,
+    model_configs: List[Tuple[Union[str, Configuration], float]],
+    logger,
+    logger_port,
+    metric,
+    dask_client: dask.distributed.Client,
+    backend: Backend,
+    memory_limit: Optional[int],
+    disable_file_output,
+    all_supported_metrics: bool,
+    ensemble_method,
+    include,
+    exclude,
+    search_space_updates,
+    pipeline_options,
+    seed: int,
+    multiprocessing_context,
+    n_jobs: int,
+    current_search_space: ConfigurationSpace,
+    smac_initial_run: int
+) -> Tuple[RunHistory, List[Optional[Tuple[int, int, float]]]]:
+    """Fit each (configuration, budget) pair on the dask client, collect the results in a
+    RunHistory and return it together with the (seed, num_run, budget) identifier of every
+    model (None for runs that did not succeed)."""
+    starttime = time.time()
+    run_history = RunHistory()
+    if memory_limit is not None:
+        memory_limit = int(math.ceil(memory_limit))
+
+    model_identifiers = []
+    total_models = len(model_configs)
+    dask_futures = []
+    for n_r, (config, budget) in enumerate(model_configs):
+
+        # Only launch a task if there is time
+        start_time = time.time()
+        if time_left >= func_eval_time_limit_secs:
+            logger.info(f"{n_r}: Started fitting {config} with cutoff={func_eval_time_limit_secs}")
+            scenario_mock = unittest.mock.Mock()
+            scenario_mock.wallclock_limit = time_left
+            # This stats object is a hack - maybe the SMAC stats object should
+            # already be generated here!
+            stats = Stats(scenario_mock)
+            stats.start_timing()
+
+            if isinstance(config, Configuration):
+                config.config_id = n_r
+                init_num_run = smac_initial_run
+            else:
+                init_num_run = smac_initial_run + n_r
+
+            ta = ExecuteTaFuncWithQueue(
+                pynisher_context=multiprocessing_context,
+                backend=backend,
+                seed=seed,
+                metric=metric,
+                multi_objectives=["cost"],
+                logger_port=logger_port,
+                pipeline_config=pipeline_options,
+                cost_for_crash=get_cost_of_crash(metric),
+                abort_on_first_run_crash=False,
+                initial_num_run=init_num_run,
+                stats=stats,
+                memory_limit=memory_limit,
+                disable_file_output=disable_file_output,
+                all_supported_metrics=all_supported_metrics,
+                ensemble_method=ensemble_method,
+                include=include,
+                exclude=exclude,
+                search_space_updates=search_space_updates
+            )
+            dask_futures.append([
+                config,
+                dask_client.submit(
+                    ta.run, config=config,
+                    cutoff=func_eval_time_limit_secs,
+                    budget=budget
+                )
+            ])
+ "Please consider increasing the run time to further improve performance.") + break + return run_history, model_identifiers diff --git a/examples/20_basics/example_autogluon_ensemble.py b/examples/20_basics/example_autogluon_ensemble.py new file mode 100644 index 000000000..8a8b692b7 --- /dev/null +++ b/examples/20_basics/example_autogluon_ensemble.py @@ -0,0 +1,105 @@ +""" +====================== +Tabular Classification +====================== + +The following example shows how to fit a sample classification model +with AutoPyTorch +""" +import os +import tempfile as tmp +import warnings +from autoPyTorch.datasets.resampling_strategy import RepeatedCrossValTypes + +from autoPyTorch.optimizer.utils import autoPyTorchSMBO + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import openml +import sklearn.model_selection + +from autoPyTorch.api.tabular_classification import TabularClassificationTask +from autoPyTorch.ensemble.utils import EnsembleSelectionTypes + +############################################################################ +# Data Loading +# ============ +task = openml.tasks.get_task(task_id=3917) +dataset = task.get_dataset() +X, y, categorical_indicator, _ = dataset.get_data( + dataset_format='dataframe', + target=dataset.default_target_attribute, +) + +train_indices, test_indices = task.get_train_test_split_indices() +# AutoPyTorch fails when it is given a y DataFrame with False and True +# values and category as dtype. in its inner workings it uses sklearn +# which cannot detect the column type. 
diff --git a/examples/20_basics/example_autogluon_ensemble.py b/examples/20_basics/example_autogluon_ensemble.py
new file mode 100644
index 000000000..8a8b692b7
--- /dev/null
+++ b/examples/20_basics/example_autogluon_ensemble.py
@@ -0,0 +1,105 @@
+"""
+======================
+Tabular Classification
+======================
+
+The following example shows how to fit an AutoGluon-style stacking ensemble
+(stacking_autogluon) on a tabular classification task with AutoPyTorch.
+"""
+import os
+import tempfile as tmp
+import warnings
+
+from autoPyTorch.datasets.resampling_strategy import RepeatedCrossValTypes
+from autoPyTorch.optimizer.utils import autoPyTorchSMBO
+
+os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import openml
+import sklearn.model_selection
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+from autoPyTorch.ensemble.utils import EnsembleSelectionTypes
+
+############################################################################
+# Data Loading
+# ============
+task = openml.tasks.get_task(task_id=3917)
+dataset = task.get_dataset()
+X, y, categorical_indicator, _ = dataset.get_data(
+    dataset_format='dataframe',
+    target=dataset.default_target_attribute,
+)
+
+train_indices, test_indices = task.get_train_test_split_indices()
+# AutoPyTorch fails when it is given a y DataFrame with False and True
+# values and category as dtype: internally it uses sklearn, which cannot
+# detect the column type.
+if isinstance(y[1], bool):
+    y = y.astype('bool')
+
+# uncomment only for np.arrays
+
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
+
+feat_type = ["numerical" if not indicator else "categorical" for indicator in categorical_indicator]
+
+############################################################################
+# Build and fit a classifier
+# ==========================
+api = TabularClassificationTask(
+    # The temporary and output directories are kept to preserve the logs of the run
+    temporary_directory='./tmp/stacking_autogluon_tmp_10',
+    output_directory='./tmp/stacking_autogluon_out_10',
+    delete_tmp_folder_after_terminate=False,
+    delete_output_folder_after_terminate=False,
+    seed=1,
+    ensemble_method=EnsembleSelectionTypes.stacking_autogluon,
+    resampling_strategy=RepeatedCrossValTypes.repeated_k_fold_cross_validation,
+    resampling_strategy_args={
+        'num_splits': 2,
+        'num_repeats': 1
+    },
+    ensemble_size=6,
+    num_stacking_layers=1,
+    feat_type=feat_type
+)
+
+############################################################################
+# Search for an ensemble of machine learning algorithms
+# =====================================================
+api.run_autogluon_stacking(
+    X_train=X_train,
+    y_train=y_train,
+    X_test=X_test.copy(),
+    y_test=y_test.copy(),
+    dataset_name='Australian',
+    optimize_metric='accuracy',
+    total_walltime_limit=600,
+    func_eval_time_limit_secs=130,
+    all_supported_metrics=False,
+    max_budget=10
+)
+
+############################################################################
+# Print the final ensemble performance
+# ====================================
+y_pred = api.predict(X_test)
+score = api.score(y_pred, y_test, metric='accuracy')
+print(score)
+# Print the final ensemble built by AutoPyTorch
+print(api.show_models())
+
+# Print statistics from search
+# print(api.sprint_statistics())
diff --git a/examples/20_basics/example_stacking_ensemble.py b/examples/20_basics/example_stacking_ensemble.py
index e3d7c308a..100341c84 100644
--- a/examples/20_basics/example_stacking_ensemble.py
+++ b/examples/20_basics/example_stacking_ensemble.py
@@ -8,6 +8,9 @@
 import os
 import tempfile as tmp
 import warnings
+from autoPyTorch.api.utils import get_autogluon_default_nn_config
+
+from autoPyTorch.datasets.resampling_strategy import RepeatedCrossValTypes
 
 os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
 os.environ['OMP_NUM_THREADS'] = '1'
@@ -17,8 +20,7 @@
 warnings.simplefilter(action='ignore', category=UserWarning)
 warnings.simplefilter(action='ignore', category=FutureWarning)
 
-import sklearn.datasets
-import sklearn.model_selection
+import openml
 
 from autoPyTorch.api.tabular_classification import TabularClassificationTask
 from autoPyTorch.ensemble.utils import EnsembleSelectionTypes
@@ -27,53 +29,82 @@
 ############################################################################
 # Data Loading
 # ============
-X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
-X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
-    X,
-    y,
-    random_state=1,
+task = openml.tasks.get_task(task_id=146821)
+dataset = task.get_dataset()
+X, y, categorical_indicator, _ = dataset.get_data(
+    dataset_format='dataframe',
+    target=dataset.default_target_attribute,
 )
+train_indices, test_indices = task.get_train_test_split_indices()
+# AutoPyTorch fails when it is given a y DataFrame with False and True
+# values and category as dtype: internally it uses sklearn, which cannot
+# detect the column type.
+if isinstance(y[1], bool):
+    y = y.astype('bool')
+
+# uncomment only for np.arrays
+
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
+
+feat_type = ["numerical" if not indicator else "categorical" for indicator in categorical_indicator]
+
+search_space_updates = get_autogluon_default_nn_config(feat_type=feat_type)
 
 ############################################################################
 # Build and fit a classifier
 # ==========================
-api = TabularClassificationTask(
-    # To maintain logs of the run, you can uncomment the
-    # Following lines
-    temporary_directory='./tmp/autoPyTorch_example_tmp_02',
-    output_directory='./tmp/autoPyTorch_example_out_02',
-    delete_tmp_folder_after_terminate=False,
-    delete_output_folder_after_terminate=False,
-    seed=42,
-    ensemble_method=EnsembleSelectionTypes.stacking_ensemble,
-    ensemble_size=5
-)
+if __name__ == '__main__':
+    api = TabularClassificationTask(
+        # The temporary and output directories are kept to preserve the logs of the run
+        temporary_directory='./tmp/stacking_optimisation_ensemble_tmp_24',
+        output_directory='./tmp/stacking_optimisation_ensemble_out_24',
+        delete_tmp_folder_after_terminate=False,
+        delete_output_folder_after_terminate=False,
+        seed=4,
+        ensemble_method=EnsembleSelectionTypes.stacking_optimisation_ensemble,
+        resampling_strategy=RepeatedCrossValTypes.stratified_repeated_k_fold_cross_validation,
+        ensemble_size=5,
+        num_stacking_layers=1,
+        resampling_strategy_args={
+            'num_splits': 5,
+            'num_repeats': 2
+        },
+        search_space_updates=search_space_updates,
+        n_jobs=1
+    )
 
-############################################################################
-# Search for an ensemble of machine learning algorithms
-# =====================================================
-api.search(
-    X_train=X_train,
-    y_train=y_train,
-    X_test=X_test.copy(),
-    y_test=y_test.copy(),
-    dataset_name='Australian',
-    optimize_metric='accuracy',
-    total_walltime_limit=1000,
-    func_eval_time_limit_secs=50,
-    enable_traditional_pipeline=False,
-    smbo_class=autoPyTorchSMBO,
-    all_supported_metrics=False
-)
+    ############################################################################
+    # Search for an ensemble of machine learning algorithms
+    # =====================================================
+    api.search(
+        X_train=X_train,
+        y_train=y_train,
+        X_test=X_test.copy(),
+        y_test=y_test.copy(),
+        dataset_name='Australian',
+        optimize_metric='balanced_accuracy',
+        total_walltime_limit=500,
+        func_eval_time_limit_secs=70,
+        enable_traditional_pipeline=True,
+        smbo_class=autoPyTorchSMBO,
+        all_supported_metrics=False,
+        # use_ensemble_opt_loss=True,
+        posthoc_ensemble_fit_stacking_ensemble_optimization=True,
+        max_budget=10
+    )
 
-############################################################################
-# Print the final ensemble performance
-# ====================================
-y_pred = api.predict(X_test)
-score = api.score(y_pred, y_test, metric='accuracy')
-print(score)
-# Print the final ensemble built by AutoPyTorch
-print(api.show_models())
+    ############################################################################
+    # Print the final ensemble performance
+    # ====================================
+    y_pred = api.predict(X_test)
+    score = api.score(y_pred, y_test, metric='accuracy')
+    print(score)
+    # Print the final ensemble built by AutoPyTorch
+    print(api.show_models())
 
-# Print statistics from search
-# print(api.sprint_statistics())
\ No newline at end of file
+    # Print statistics from search
+    # print(api.sprint_statistics())
\ No newline at end of file
diff --git a/examples/20_basics/example_stacking_ensemble_selection_base.py b/examples/20_basics/example_stacking_ensemble_selection_base.py
new file mode 100644
index 000000000..58e6ad378
--- /dev/null
+++ b/examples/20_basics/example_stacking_ensemble_selection_base.py
@@ -0,0 +1,109 @@
+"""
+======================
+Tabular Classification
+======================
+
+The following example shows how to fit a stacking ensemble that reuses its
+base models across layers (stacking_repeat_models) with AutoPyTorch.
+"""
+import os
+import tempfile as tmp
+import warnings
+
+from autoPyTorch.api.utils import get_autogluon_default_nn_config
+from autoPyTorch.datasets.resampling_strategy import RepeatedCrossValTypes
+from autoPyTorch.optimizer.utils import autoPyTorchSMBO
+
+os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import openml
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+from autoPyTorch.ensemble.utils import EnsembleSelectionTypes
+
+############################################################################
+# Data Loading
+# ============
+task = openml.tasks.get_task(task_id=3917)
+dataset = task.get_dataset()
+X, y, categorical_indicator, _ = dataset.get_data(
+    dataset_format='dataframe',
+    target=dataset.default_target_attribute,
+)
+
+train_indices, test_indices = task.get_train_test_split_indices()
+# AutoPyTorch fails when it is given a y DataFrame with False and True
+# values and category as dtype: internally it uses sklearn, which cannot
+# detect the column type.
+if isinstance(y[1], bool):
+    y = y.astype('bool')
+
+# uncomment only for np.arrays
+
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
+
+feat_type = ["numerical" if not indicator else "categorical" for indicator in categorical_indicator]
+
+search_space_updates = get_autogluon_default_nn_config(feat_type=feat_type)
+
+############################################################################
+# Build and fit a classifier
+# ==========================
+api = TabularClassificationTask(
+    # The temporary and output directories are kept to preserve the logs of the run
+    temporary_directory='./tmp/stacking_repeat_base_models_tmp_07',
+    output_directory='./tmp/stacking_repeat_base_models_out_07',
+    delete_tmp_folder_after_terminate=False,
+    delete_output_folder_after_terminate=False,
+    seed=1,
+    ensemble_method=EnsembleSelectionTypes.stacking_repeat_models,
+    resampling_strategy=RepeatedCrossValTypes.repeated_k_fold_cross_validation,
+    resampling_strategy_args={
+        'num_splits': 2,
+        'num_repeats': 1
+    },
+    ensemble_size=5,
+    num_stacking_layers=2,
+    search_space_updates=search_space_updates
+)
+
+############################################################################
+# Search for an ensemble of machine learning algorithms
+# =====================================================
+api.search(
+    X_train=X_train,
+    y_train=y_train,
+    X_test=X_test.copy(),
+    y_test=y_test.copy(),
+    dataset_name='Australian',
+    optimize_metric='accuracy',
+    total_walltime_limit=900,
+    func_eval_time_limit_secs=150,
+    enable_traditional_pipeline=True,
+    # smbo_class=autoPyTorchSMBO,
+    all_supported_metrics=False,
+    min_budget=5,
+    max_budget=10
+)
+
+############################################################################
+# Print the final ensemble performance
+# ====================================
+y_pred = api.predict(X_test)
+score = api.score(y_pred, y_test, metric='accuracy')
+print(score)
+# Print the final ensemble built by AutoPyTorch
+print(api.show_models())
+
+# Print statistics from search
+# print(api.sprint_statistics())
diff --git a/examples/20_basics/example_stacking_ensemble_selection_per_layer.py b/examples/20_basics/example_stacking_ensemble_selection_per_layer.py
new file mode 100644
index 000000000..37f40f850
--- /dev/null
+++ b/examples/20_basics/example_stacking_ensemble_selection_per_layer.py
@@ -0,0 +1,107 @@
+"""
+======================
+Tabular Classification
+======================
+The following example shows how to fit a stacking ensemble that runs ensemble
+selection per layer (stacking_ensemble_selection_per_layer) with AutoPyTorch.
+"""
+import os
+import tempfile as tmp
+import warnings
+
+from autoPyTorch.api.utils import get_autogluon_default_nn_config
+from autoPyTorch.datasets.resampling_strategy import RepeatedCrossValTypes
+
+os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import openml
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+from autoPyTorch.ensemble.utils import EnsembleSelectionTypes
+from autoPyTorch.optimizer.utils import autoPyTorchSMBO
+
+############################################################################
+# Data Loading
+# ============
+task = openml.tasks.get_task(task_id=3917)
+dataset = task.get_dataset()
+X, y, categorical_indicator, _ = dataset.get_data(
+    dataset_format='dataframe',
+    target=dataset.default_target_attribute,
+)
+
+train_indices, test_indices = task.get_train_test_split_indices()
+# AutoPyTorch fails when it is given a y DataFrame with False and True
+# values and category as dtype: internally it uses sklearn, which cannot
+# detect the column type.
+if isinstance(y[1], bool):
+    y = y.astype('bool')
+
+# uncomment only for np.arrays
+
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
+
+feat_type = ["numerical" if not indicator else "categorical" for indicator in categorical_indicator]
+
+search_space_updates = get_autogluon_default_nn_config(feat_type=feat_type)
+
+############################################################################
+# Build and fit a classifier
+# ==========================
+if __name__ == '__main__':
+    api = TabularClassificationTask(
+        # The temporary and output directories are kept to preserve the logs of the run
+        temporary_directory='./tmp/stacking_ensemble_selection_per_layer_tmp_09',
+        output_directory='./tmp/stacking_ensemble_selection_per_layer_out_09',
+        delete_tmp_folder_after_terminate=False,
+        delete_output_folder_after_terminate=False,
+        seed=4,
+        ensemble_method=EnsembleSelectionTypes.stacking_ensemble_selection_per_layer,
+        resampling_strategy=RepeatedCrossValTypes.repeated_k_fold_cross_validation,
+        ensemble_size=5,
+        num_stacking_layers=2,
+        resampling_strategy_args={
+            'num_splits': 5,
+            'num_repeats': 2
+        },
+        search_space_updates=search_space_updates,
+    )
+
+    ############################################################################
+    # Search for an ensemble of machine learning algorithms
+    # =====================================================
+    api.search(
+        X_train=X_train,
+        y_train=y_train,
+        X_test=X_test.copy(),
+        y_test=y_test.copy(),
+        dataset_name='Australian',
+        optimize_metric='balanced_accuracy',
+        total_walltime_limit=900,
+        func_eval_time_limit_secs=150,
+        enable_traditional_pipeline=False,
+        all_supported_metrics=False,
+        min_budget=5,
+        max_budget=15
+    )
+
+    ############################################################################
+    # Print the final ensemble performance
+    # ====================================
+    y_pred = api.predict(X_test)
+    score = api.score(y_pred, y_test, metric='accuracy')
+    print(score)
+    # Print the final ensemble built by AutoPyTorch
+    print(api.show_models())
+
+    # Print statistics from search
+    # print(api.sprint_statistics())
\ No newline at end of file
diff --git a/examples/20_basics/example_tabular_classification.py b/examples/20_basics/example_tabular_classification.py
index 636281eff..d57a42bb3 100644
--- a/examples/20_basics/example_tabular_classification.py
+++ b/examples/20_basics/example_tabular_classification.py
@@ -10,6 +10,9 @@
 import tempfile as tmp
 import warnings
 
+from autoPyTorch.datasets.resampling_strategy import RepeatedCrossValTypes
+from autoPyTorch.api.utils import get_autogluon_default_nn_config
+
 os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
 os.environ['OMP_NUM_THREADS'] = '1'
 os.environ['OPENBLAS_NUM_THREADS'] = '1'
@@ -18,33 +21,52 @@
 warnings.simplefilter(action='ignore', category=UserWarning)
 warnings.simplefilter(action='ignore', category=FutureWarning)
 
-import sklearn.datasets
-import sklearn.model_selection
+import openml
 
 from autoPyTorch.api.tabular_classification import TabularClassificationTask
-
+from autoPyTorch.ensemble.utils import EnsembleSelectionTypes
 
 ############################################################################
 # Data Loading
 # ============
-X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
-X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
-    X,
-    y,
-    random_state=1,
+task = openml.tasks.get_task(task_id=146821)
+dataset = task.get_dataset()
+X, y, categorical_indicator, _ = dataset.get_data(
+    dataset_format='dataframe',
+    target=dataset.default_target_attribute,
 )
+train_indices, test_indices = task.get_train_test_split_indices()
+# AutoPyTorch fails when it is given a y DataFrame with False and True
+# values and category as dtype: internally it uses sklearn, which cannot
+# detect the column type.
+if isinstance(y[1], bool):
+    y = y.astype('bool')
+
+# uncomment only for np.arrays
+
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
+
+feat_type = ["numerical" if not indicator else "categorical" for indicator in categorical_indicator]
+
+search_space_updates = get_autogluon_default_nn_config(feat_type=feat_type)
 
 ############################################################################
 # Build and fit a classifier
 # ==========================
 api = TabularClassificationTask(
     # To maintain logs of the run, you can uncomment the
     # Following lines
-    # temporary_directory='./tmp/autoPyTorch_example_tmp_01',
-    # output_directory='./tmp/autoPyTorch_example_out_01',
-    # delete_tmp_folder_after_terminate=False,
-    # delete_output_folder_after_terminate=False,
+    temporary_directory='./tmp/autoPyTorch_example_tmp_06',
+    output_directory='./tmp/autoPyTorch_example_out_06',
+    delete_tmp_folder_after_terminate=False,
+    delete_output_folder_after_terminate=False,
     seed=42,
+    ensemble_size=5,
+    resampling_strategy=RepeatedCrossValTypes.repeated_k_fold_cross_validation,
+    search_space_updates=search_space_updates
 )
 
 ############################################################################
@@ -57,8 +79,9 @@
     y_test=y_test.copy(),
     dataset_name='Australian',
     optimize_metric='accuracy',
-    total_walltime_limit=300,
-    func_eval_time_limit_secs=50
+    total_walltime_limit=1200,
+    func_eval_time_limit_secs=200,
+    enable_traditional_pipeline=True
 )
 
 ############################################################################
diff --git a/requirements.txt b/requirements.txt
index 1f2dd38b6..f458ef51a 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,10 +10,12 @@
 imgaug>=0.4.0
 ConfigSpace>=0.4.14,<0.5
 pynisher>=0.6.3
 pyrfr>=0.7,<0.9
-smac>=1.2
+smac==1.2
 dask
 distributed>=2.2.0
-catboost
-lightgbm
 flaky
-tabulate
\ No newline at end of file
+tabulate
+lightgbm>=3.3,<3.4
+catboost>=1.0,<1.1
+xgboost>=1.4,<1.5
+scikit-learn-intelex>=2021.5,<2021.6