diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index fc47c8d44f..41330a78a9 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -43,6 +43,13 @@ module.exports = { label: 'GitHub', position: 'right', }, + { + type: 'docsVersionDropdown', + position: 'right', + + }, + + ], }, footer: { diff --git a/website/versioned_docs/version-1.0.4/Contribute.md b/website/versioned_docs/version-1.0.4/Contribute.md new file mode 100644 index 0000000000..b0e26c4855 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Contribute.md @@ -0,0 +1,122 @@ +# Contributing + +This project welcomes and encourages all forms of contributions, including but not limited to: + +- Pushing patches. +- Code review of pull requests. +- Documentation, examples and test cases. +- Readability improvement, e.g., improvement on docstr and comments. +- Community participation in [issues](https://github.com/microsoft/FLAML/issues), [discussions](https://github.com/microsoft/FLAML/discussions), and [discord](https://discord.gg/7ZVfhbTQZ5). +- Tutorials, blog posts, talks that promote the project. +- Sharing application scenarios and/or related research. + +You can take a look at the [Roadmap for Upcoming Features](https://github.com/microsoft/FLAML/wiki/Roadmap-for-Upcoming-Features) to identify potential things to work on. + +Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. For details, visit . + +If you are new to GitHub [here](https://help.github.com/categories/collaborating-with-issues-and-pull-requests/) is a detailed help source on getting involved with development on GitHub. + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions +provided by the bot. 
You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +## How to make a good bug report + +When you submit an issue to [GitHub](https://github.com/microsoft/FLAML/issues), please do your best to +follow these guidelines! This will make it a lot easier to provide you with good +feedback: + +- The ideal bug report contains a short reproducible code snippet. This way + anyone can try to reproduce the bug easily (see [this](https://stackoverflow.com/help/mcve) for more details). If your snippet is + longer than around 50 lines, please link to a [gist](https://gist.github.com) or a GitHub repo. + +- If an exception is raised, please **provide the full traceback**. + +- Please include your **operating system type and version number**, as well as + your **Python, flaml, scikit-learn versions**. The version of flaml + can be found by running the following code snippet: +```python +import flaml +print(flaml.__version__) +``` + +- Please ensure all **code snippets and error messages are formatted in + appropriate code blocks**. See [Creating and highlighting code blocks](https://help.github.com/articles/creating-and-highlighting-code-blocks) + for more details. + + +## Becoming a Reviewer + +There is currently no formal reviewer solicitation process. Current reviewers identify reviewers from active contributors. If you are willing to become a reviewer, you are welcome to let us know on gitter. 
+ +## Developing + +### Setup + +```bash +git clone https://github.com/microsoft/FLAML.git +pip install -e FLAML[test,notebook] +``` + +In case the `pip install` command fails, try escaping the brackets such as `pip install -e FLAML\[test,notebook\]` + +### Docker + +We provide a simple [Dockerfile](https://github.com/microsoft/FLAML/blob/main/Dockerfile). + +```bash +docker build https://github.com/microsoft/FLAML.git#main -t flaml-dev +docker run -it flaml-dev +``` + +### Develop in Remote Container + +If you use vscode, you can open the FLAML folder in a [Container](https://code.visualstudio.com/docs/remote/containers). +We have provided the configuration in [devcontainer](https://github.com/microsoft/FLAML/blob/main/.devcontainer). + +### Pre-commit + +Run `pre-commit install` to install pre-commit into your git hooks. Before you commit, run +`pre-commit run` to check if you meet the pre-commit requirements. If you use Windows (without WSL) and can't commit after installing pre-commit, you can run `pre-commit uninstall` to uninstall the hook. In WSL or Linux this is supposed to work. + +### Coverage + +Any code you commit should not decrease coverage. To run all unit tests: + +```bash +coverage run -m pytest test +``` + +Then you can see the coverage report by +`coverage report -m` or `coverage html`. + +### Documentation + +To build and test documentation locally, install [Node.js](https://nodejs.org/en/download/). For example, + +```bash +nvm install --lts +``` + +Then: + +```console +npm install --global yarn # skip if you use the dev container we provided +pip install pydoc-markdown==4.5.0 # skip if you use the dev container we provided +cd website +yarn install --frozen-lockfile --ignore-engines +pydoc-markdown +yarn start +``` + +The last command starts a local development server and opens up a browser window. +Most changes are reflected live without having to restart the server. 
+ +Note: +some tips in this guide are based off the contributor guide from [ray](https://docs.ray.io/en/latest/ray-contribute/getting-involved.html), [scikit-learn](https://scikit-learn.org/stable/developers/contributing.html), or [hummingbird](https://github.com/microsoft/hummingbird/blob/main/CONTRIBUTING.md). diff --git a/website/versioned_docs/version-1.0.4/Examples/AutoML-Classification.md b/website/versioned_docs/version-1.0.4/Examples/AutoML-Classification.md new file mode 100644 index 0000000000..4d472b081a --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Examples/AutoML-Classification.md @@ -0,0 +1,62 @@ +# AutoML - Classification + +### A basic classification example + +```python +from flaml import AutoML +from sklearn.datasets import load_iris + +# Initialize an AutoML instance +automl = AutoML() +# Specify automl goal and constraint +automl_settings = { + "time_budget": 1, # in seconds + "metric": 'accuracy', + "task": 'classification', + "log_file_name": "iris.log", +} +X_train, y_train = load_iris(return_X_y=True) +# Train with labeled input data +automl.fit(X_train=X_train, y_train=y_train, + **automl_settings) +# Predict +print(automl.predict_proba(X_train)) +# Print the best model +print(automl.model.estimator) +``` + +#### Sample of output +``` +[flaml.automl: 11-12 18:21:44] {1485} INFO - Data split method: stratified +[flaml.automl: 11-12 18:21:44] {1489} INFO - Evaluation method: cv +[flaml.automl: 11-12 18:21:44] {1540} INFO - Minimizing error metric: 1-accuracy +[flaml.automl: 11-12 18:21:44] {1577} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'lrl1'] +[flaml.automl: 11-12 18:21:44] {1826} INFO - iteration 0, current learner lgbm +[flaml.automl: 11-12 18:21:44] {1944} INFO - Estimated sufficient time budget=1285s. Estimated necessary time budget=23s. 
+[flaml.automl: 11-12 18:21:44] {2029} INFO - at 0.2s, estimator lgbm's best error=0.0733, best estimator lgbm's best error=0.0733 +[flaml.automl: 11-12 18:21:44] {1826} INFO - iteration 1, current learner lgbm +[flaml.automl: 11-12 18:21:44] {2029} INFO - at 0.3s, estimator lgbm's best error=0.0733, best estimator lgbm's best error=0.0733 +[flaml.automl: 11-12 18:21:44] {1826} INFO - iteration 2, current learner lgbm +[flaml.automl: 11-12 18:21:44] {2029} INFO - at 0.4s, estimator lgbm's best error=0.0533, best estimator lgbm's best error=0.0533 +[flaml.automl: 11-12 18:21:44] {1826} INFO - iteration 3, current learner lgbm +[flaml.automl: 11-12 18:21:44] {2029} INFO - at 0.6s, estimator lgbm's best error=0.0533, best estimator lgbm's best error=0.0533 +[flaml.automl: 11-12 18:21:44] {1826} INFO - iteration 4, current learner lgbm +[flaml.automl: 11-12 18:21:44] {2029} INFO - at 0.6s, estimator lgbm's best error=0.0533, best estimator lgbm's best error=0.0533 +[flaml.automl: 11-12 18:21:44] {1826} INFO - iteration 5, current learner xgboost +[flaml.automl: 11-12 18:21:45] {2029} INFO - at 0.9s, estimator xgboost's best error=0.0600, best estimator lgbm's best error=0.0533 +[flaml.automl: 11-12 18:21:45] {1826} INFO - iteration 6, current learner lgbm +[flaml.automl: 11-12 18:21:45] {2029} INFO - at 1.0s, estimator lgbm's best error=0.0533, best estimator lgbm's best error=0.0533 +[flaml.automl: 11-12 18:21:45] {1826} INFO - iteration 7, current learner extra_tree +[flaml.automl: 11-12 18:21:45] {2029} INFO - at 1.1s, estimator extra_tree's best error=0.0667, best estimator lgbm's best error=0.0533 +[flaml.automl: 11-12 18:21:45] {2242} INFO - retrain lgbm for 0.0s +[flaml.automl: 11-12 18:21:45] {2247} INFO - retrained model: LGBMClassifier(learning_rate=0.2677050123105203, max_bin=127, + min_child_samples=12, n_estimators=4, num_leaves=4, + reg_alpha=0.001348364934537134, reg_lambda=1.4442580148221913, + verbose=-1) +[flaml.automl: 11-12 18:21:45] {1608} INFO - 
fit succeeded +[flaml.automl: 11-12 18:21:45] {1610} INFO - Time taken to find the best model: 0.3756711483001709 +``` + +### A more advanced example including custom learner and metric + +[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_classification.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_classification.ipynb) \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.4/Examples/AutoML-NLP.md b/website/versioned_docs/version-1.0.4/Examples/AutoML-NLP.md new file mode 100644 index 0000000000..7ef0f6c7a7 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Examples/AutoML-NLP.md @@ -0,0 +1,376 @@ +# AutoML - NLP + +### Requirements + +This example requires GPU. Install the [nlp] option: +```python +pip install "flaml[nlp]" +``` + +### A simple sequence classification example + +```python +from flaml import AutoML +from datasets import load_dataset + +train_dataset = load_dataset("glue", "mrpc", split="train").to_pandas() +dev_dataset = load_dataset("glue", "mrpc", split="validation").to_pandas() +test_dataset = load_dataset("glue", "mrpc", split="test").to_pandas() +custom_sent_keys = ["sentence1", "sentence2"] +label_key = "label" +X_train, y_train = train_dataset[custom_sent_keys], train_dataset[label_key] +X_val, y_val = dev_dataset[custom_sent_keys], dev_dataset[label_key] +X_test = test_dataset[custom_sent_keys] + +automl = AutoML() +automl_settings = { + "time_budget": 100, + "task": "seq-classification", + "fit_kwargs_by_estimator": { + "transformer": + { + "output_dir": "data/output/" # if model_path is not set, the default model is facebook/muppet-roberta-base: https://huggingface.co/facebook/muppet-roberta-base + } + }, # setting the huggingface arguments: output directory + "gpu_per_trial": 1, # set to 0 if no GPU is available +} +automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings) 
+automl.predict(X_test) +``` + +Notice that after you run `automl.fit`, the intermediate checkpoints are saved under the specified output_dir `data/output`. You can use the following code to clean these outputs if they consume a large storage space: + +```python +if os.path.exists("data/output/"): + shutil.rmtree("data/output/") +``` + +#### Sample output + +``` +[flaml.automl: 12-06 08:21:39] {1943} INFO - task = seq-classification +[flaml.automl: 12-06 08:21:39] {1945} INFO - Data split method: stratified +[flaml.automl: 12-06 08:21:39] {1949} INFO - Evaluation method: holdout +[flaml.automl: 12-06 08:21:39] {2019} INFO - Minimizing error metric: 1-accuracy +[flaml.automl: 12-06 08:21:39] {2071} INFO - List of ML learners in AutoML Run: ['transformer'] +[flaml.automl: 12-06 08:21:39] {2311} INFO - iteration 0, current learner transformer +{'data/output/train_2021-12-06_08-21-53/train_8947b1b2_1_n=1e-06,s=9223372036854775807,e=1e-05,s=-1,s=0.45765,e=32,d=42,o=0.0,y=0.0_2021-12-06_08-21-53/checkpoint-53': 53} +[flaml.automl: 12-06 08:22:56] {2424} INFO - Estimated sufficient time budget=766860s. Estimated necessary time budget=767s. +[flaml.automl: 12-06 08:22:56] {2499} INFO - at 76.7s, estimator transformer's best error=0.1740, best estimator transformer's best error=0.1740 +[flaml.automl: 12-06 08:22:56] {2606} INFO - selected model: +[flaml.automl: 12-06 08:22:56] {2100} INFO - fit succeeded +[flaml.automl: 12-06 08:22:56] {2101} INFO - Time taken to find the best model: 76.69802761077881 +[flaml.automl: 12-06 08:22:56] {2112} WARNING - Time taken to find the best model is 77% of the provided time budget and not all estimators' hyperparameter search converged. Consider increasing the time budget. 
+``` + +### A simple sequence regression example + +```python +from flaml import AutoML +from datasets import load_dataset + +train_dataset = ( + load_dataset("glue", "stsb", split="train").to_pandas() +) +dev_dataset = ( + load_dataset("glue", "stsb", split="train").to_pandas() +) +custom_sent_keys = ["sentence1", "sentence2"] +label_key = "label" +X_train = train_dataset[custom_sent_keys] +y_train = train_dataset[label_key] +X_val = dev_dataset[custom_sent_keys] +y_val = dev_dataset[label_key] + +automl = AutoML() +automl_settings = { + "gpu_per_trial": 0, + "time_budget": 20, + "task": "seq-regression", + "metric": "rmse", +} +automl_settings["fit_kwargs_by_estimator"] = { # setting the huggingface arguments + "transformer": { + "model_path": "google/electra-small-discriminator", # if model_path is not set, the default model is facebook/muppet-roberta-base: https://huggingface.co/facebook/muppet-roberta-base + "output_dir": "data/output/", # setting the output directory + "fp16": False, + } # setting whether to use FP16 +} +automl.fit( + X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings +) +``` + +#### Sample output + +``` +[flaml.automl: 12-20 11:47:28] {1965} INFO - task = seq-regression +[flaml.automl: 12-20 11:47:28] {1967} INFO - Data split method: uniform +[flaml.automl: 12-20 11:47:28] {1971} INFO - Evaluation method: holdout +[flaml.automl: 12-20 11:47:28] {2063} INFO - Minimizing error metric: rmse +[flaml.automl: 12-20 11:47:28] {2115} INFO - List of ML learners in AutoML Run: ['transformer'] +[flaml.automl: 12-20 11:47:28] {2355} INFO - iteration 0, current learner transformer +``` + +### A simple summarization example + +```python +from flaml import AutoML +from datasets import load_dataset + +train_dataset = ( + load_dataset("xsum", split="train").to_pandas() +) +dev_dataset = ( + load_dataset("xsum", split="validation").to_pandas() +) +custom_sent_keys = ["document"] +label_key = "summary" + +X_train = 
train_dataset[custom_sent_keys] +y_train = train_dataset[label_key] + +X_val = dev_dataset[custom_sent_keys] +y_val = dev_dataset[label_key] + +automl = AutoML() +automl_settings = { + "gpu_per_trial": 1, + "time_budget": 20, + "task": "summarization", + "metric": "rouge1", +} +automl_settings["fit_kwargs_by_estimator"] = { # setting the huggingface arguments + "transformer": { + "model_path": "t5-small", # if model_path is not set, the default model is t5-small: https://huggingface.co/t5-small + "output_dir": "data/output/", # setting the output directory + "fp16": False, + } # setting whether to use FP16 +} +automl.fit( + X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings +) +``` +#### Sample Output + +``` +[flaml.automl: 12-20 11:44:03] {1965} INFO - task = summarization +[flaml.automl: 12-20 11:44:03] {1967} INFO - Data split method: uniform +[flaml.automl: 12-20 11:44:03] {1971} INFO - Evaluation method: holdout +[flaml.automl: 12-20 11:44:03] {2063} INFO - Minimizing error metric: -rouge +[flaml.automl: 12-20 11:44:03] {2115} INFO - List of ML learners in AutoML Run: ['transformer'] +[flaml.automl: 12-20 11:44:03] {2355} INFO - iteration 0, current learner transformer +loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /home/xliu127/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985 +Model config T5Config { + "_name_or_path": "t5-small", + "architectures": [ + "T5WithLMHeadModel" + ], + "d_ff": 2048, + "d_kv": 64, + "d_model": 512, + "decoder_start_token_id": 0, + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "relu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 6, + "num_heads": 8, + "num_layers": 6, + "output_past": true, + "pad_token_id": 0, + 
"relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "transformers_version": "4.14.1", + "use_cache": true, + "vocab_size": 32128 +} +``` + +### A simple token classification example + +There are two ways to define the label for a token classification task. The first is to define the token labels: + +```python +from flaml import AutoML +import pandas as pd + +train_dataset = { + "id": ["0", "1"], + "ner_tags": [ + ["B-ORG", "O", "B-MISC", "O", "O", "O", "B-MISC", "O", "O"], + ["B-PER", "I-PER"], + ], + "tokens": [ + [ + "EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", ".", + ], + ["Peter", "Blackburn"], + ], +} +dev_dataset = { + "id": ["0"], + "ner_tags": [ + ["O"], + ], + "tokens": [ + ["1996-08-22"] + ], +} +test_dataset = { + "id": ["0"], + "ner_tags": [ + ["O"], + ], + "tokens": [ + ['.'] + ], +} +custom_sent_keys = ["tokens"] +label_key = "ner_tags" + +train_dataset = pd.DataFrame(train_dataset) +dev_dataset = pd.DataFrame(dev_dataset) +test_dataset = pd.DataFrame(test_dataset) + +X_train, y_train = train_dataset[custom_sent_keys], train_dataset[label_key] +X_val, y_val = dev_dataset[custom_sent_keys], dev_dataset[label_key] +X_test = test_dataset[custom_sent_keys] + +automl = AutoML() +automl_settings = { + "time_budget": 10, + "task": "token-classification", + "fit_kwargs_by_estimator": { + "transformer": + { 
+ "output_dir": "data/output/" + # if model_path is not set, the default model is facebook/muppet-roberta-base: https://huggingface.co/facebook/muppet-roberta-base + } + }, # setting the huggingface arguments: output directory + "gpu_per_trial": 1, # set to 0 if no GPU is available + "metric": "seqeval:overall_f1" +} + +automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings) +automl.predict(X_test) +``` + +The second is to define the id labels + a token [label list](https://microsoft.github.io/FLAML/docs/reference/nlp/huggingface/training_args): + +```python +from flaml import AutoML +import pandas as pd + +train_dataset = { + "id": ["0", "1"], + "ner_tags": [ + [3, 0, 7, 0, 0, 0, 7, 0, 0], + [1, 2], + ], + "tokens": [ + [ + "EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", ".", + ], + ["Peter", "Blackburn"], + ], + } +dev_dataset = { + "id": ["0"], + "ner_tags": [ + [0], + ], + "tokens": [ + ["1996-08-22"] + ], +} +test_dataset = { + "id": ["0"], + "ner_tags": [ + [0], + ], + "tokens": [ + ['.'] + ], +} +custom_sent_keys = ["tokens"] +label_key = "ner_tags" + +train_dataset = pd.DataFrame(train_dataset) +dev_dataset = pd.DataFrame(dev_dataset) +test_dataset = pd.DataFrame(test_dataset) + +X_train, y_train = train_dataset[custom_sent_keys], train_dataset[label_key] +X_val, y_val = dev_dataset[custom_sent_keys], dev_dataset[label_key] +X_test = test_dataset[custom_sent_keys] + +automl = AutoML() +automl_settings = { + "time_budget": 10, + "task": "token-classification", + "fit_kwargs_by_estimator": { + "transformer": + { + "output_dir": "data/output/", + # if model_path is not set, the default model is facebook/muppet-roberta-base: https://huggingface.co/facebook/muppet-roberta-base + "label_list": [ "O","B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC" ] + } + }, # setting the huggingface arguments: output directory + "gpu_per_trial": 1, # set to 0 if no GPU is available + "metric": 
"seqeval:overall_f1" +} + +automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings) +automl.predict(X_test) +``` + +#### Sample Output + +``` +[flaml.automl: 06-30 03:10:02] {2423} INFO - task = token-classification +[flaml.automl: 06-30 03:10:02] {2425} INFO - Data split method: stratified +[flaml.automl: 06-30 03:10:02] {2428} INFO - Evaluation method: holdout +[flaml.automl: 06-30 03:10:02] {2497} INFO - Minimizing error metric: seqeval:overall_f1 +[flaml.automl: 06-30 03:10:02] {2637} INFO - List of ML learners in AutoML Run: ['transformer'] +[flaml.automl: 06-30 03:10:02] {2929} INFO - iteration 0, current learner transformer +``` + +For tasks that are not currently supported, use `flaml.tune` for [customized tuning](Tune-HuggingFace). + +### Link to Jupyter notebook + +To run more examples, especially examples using Ray Tune, please go to: + +[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_nlp.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_nlp.ipynb) \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.4/Examples/AutoML-Rank.md b/website/versioned_docs/version-1.0.4/Examples/AutoML-Rank.md new file mode 100644 index 0000000000..a145f225e3 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Examples/AutoML-Rank.md @@ -0,0 +1,96 @@ +# AutoML - Rank + +### A simple learning-to-rank example + +```python +from sklearn.datasets import fetch_openml +from flaml import AutoML + +X_train, y_train = fetch_openml(name="credit-g", return_X_y=True, as_frame=False) +y_train = y_train.cat.codes +# not a real learning to rank dataaset +groups = [200] * 4 + [100] * 2 # group counts +automl = AutoML() +automl.fit( + X_train, y_train, groups=groups, + task='rank', time_budget=10, # in seconds +) +``` + +#### Sample output + +``` +[flaml.automl: 11-15 07:14:30] {1485} INFO - Data split method: group +[flaml.automl: 11-15 
07:14:30] {1489} INFO - Evaluation method: holdout +[flaml.automl: 11-15 07:14:30] {1540} INFO - Minimizing error metric: 1-ndcg +[flaml.automl: 11-15 07:14:30] {1577} INFO - List of ML learners in AutoML Run: ['lgbm', 'xgboost'] +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 0, current learner lgbm +[flaml.automl: 11-15 07:14:30] {1944} INFO - Estimated sufficient time budget=679s. Estimated necessary time budget=1s. +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.1s, estimator lgbm's best error=0.0248, best estimator lgbm's best error=0.0248 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 1, current learner lgbm +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.1s, estimator lgbm's best error=0.0248, best estimator lgbm's best error=0.0248 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 2, current learner lgbm +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.2s, estimator lgbm's best error=0.0248, best estimator lgbm's best error=0.0248 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 3, current learner lgbm +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.2s, estimator lgbm's best error=0.0248, best estimator lgbm's best error=0.0248 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 4, current learner xgboost +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.2s, estimator xgboost's best error=0.0315, best estimator lgbm's best error=0.0248 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 5, current learner xgboost +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.2s, estimator xgboost's best error=0.0315, best estimator lgbm's best error=0.0248 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 6, current learner lgbm +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.3s, estimator lgbm's best error=0.0248, best estimator lgbm's best error=0.0248 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 7, current learner lgbm +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.3s, estimator lgbm's best 
error=0.0248, best estimator lgbm's best error=0.0248 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 8, current learner xgboost +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.4s, estimator xgboost's best error=0.0315, best estimator lgbm's best error=0.0248 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 9, current learner xgboost +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.4s, estimator xgboost's best error=0.0315, best estimator lgbm's best error=0.0248 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 10, current learner xgboost +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.4s, estimator xgboost's best error=0.0233, best estimator xgboost's best error=0.0233 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 11, current learner xgboost +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.4s, estimator xgboost's best error=0.0233, best estimator xgboost's best error=0.0233 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 12, current learner xgboost +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.4s, estimator xgboost's best error=0.0233, best estimator xgboost's best error=0.0233 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 13, current learner xgboost +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.4s, estimator xgboost's best error=0.0233, best estimator xgboost's best error=0.0233 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 14, current learner lgbm +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.5s, estimator lgbm's best error=0.0225, best estimator lgbm's best error=0.0225 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 15, current learner xgboost +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.5s, estimator xgboost's best error=0.0233, best estimator lgbm's best error=0.0225 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 16, current learner lgbm +[flaml.automl: 11-15 07:14:30] {2029} INFO - at 0.5s, estimator lgbm's best error=0.0225, best estimator lgbm's 
best error=0.0225 +[flaml.automl: 11-15 07:14:30] {1826} INFO - iteration 17, current learner lgbm +[flaml.automl: 11-15 07:14:31] {2029} INFO - at 0.5s, estimator lgbm's best error=0.0225, best estimator lgbm's best error=0.0225 +[flaml.automl: 11-15 07:14:31] {1826} INFO - iteration 18, current learner lgbm +[flaml.automl: 11-15 07:14:31] {2029} INFO - at 0.6s, estimator lgbm's best error=0.0225, best estimator lgbm's best error=0.0225 +[flaml.automl: 11-15 07:14:31] {1826} INFO - iteration 19, current learner lgbm +[flaml.automl: 11-15 07:14:31] {2029} INFO - at 0.6s, estimator lgbm's best error=0.0201, best estimator lgbm's best error=0.0201 +[flaml.automl: 11-15 07:14:31] {1826} INFO - iteration 20, current learner lgbm +[flaml.automl: 11-15 07:14:31] {2029} INFO - at 0.6s, estimator lgbm's best error=0.0201, best estimator lgbm's best error=0.0201 +[flaml.automl: 11-15 07:14:31] {1826} INFO - iteration 21, current learner lgbm +[flaml.automl: 11-15 07:14:31] {2029} INFO - at 0.7s, estimator lgbm's best error=0.0201, best estimator lgbm's best error=0.0201 +[flaml.automl: 11-15 07:14:31] {1826} INFO - iteration 22, current learner lgbm +[flaml.automl: 11-15 07:14:31] {2029} INFO - at 0.7s, estimator lgbm's best error=0.0201, best estimator lgbm's best error=0.0201 +[flaml.automl: 11-15 07:14:31] {1826} INFO - iteration 23, current learner lgbm +[flaml.automl: 11-15 07:14:31] {2029} INFO - at 0.8s, estimator lgbm's best error=0.0201, best estimator lgbm's best error=0.0201 +[flaml.automl: 11-15 07:14:31] {1826} INFO - iteration 24, current learner lgbm +[flaml.automl: 11-15 07:14:31] {2029} INFO - at 0.8s, estimator lgbm's best error=0.0201, best estimator lgbm's best error=0.0201 +[flaml.automl: 11-15 07:14:31] {1826} INFO - iteration 25, current learner lgbm +[flaml.automl: 11-15 07:14:31] {2029} INFO - at 0.8s, estimator lgbm's best error=0.0201, best estimator lgbm's best error=0.0201 +[flaml.automl: 11-15 07:14:31] {1826} INFO - iteration 26, current 
learner lgbm +[flaml.automl: 11-15 07:14:31] {2029} INFO - at 0.9s, estimator lgbm's best error=0.0197, best estimator lgbm's best error=0.0197 +[flaml.automl: 11-15 07:14:31] {1826} INFO - iteration 27, current learner lgbm +[flaml.automl: 11-15 07:14:31] {2029} INFO - at 0.9s, estimator lgbm's best error=0.0197, best estimator lgbm's best error=0.0197 +[flaml.automl: 11-15 07:14:31] {1826} INFO - iteration 28, current learner lgbm +[flaml.automl: 11-15 07:14:31] {2029} INFO - at 1.0s, estimator lgbm's best error=0.0197, best estimator lgbm's best error=0.0197 +[flaml.automl: 11-15 07:14:31] {1826} INFO - iteration 29, current learner lgbm +[flaml.automl: 11-15 07:14:31] {2029} INFO - at 1.0s, estimator lgbm's best error=0.0197, best estimator lgbm's best error=0.0197 +[flaml.automl: 11-15 07:14:31] {2242} INFO - retrain lgbm for 0.0s +[flaml.automl: 11-15 07:14:31] {2247} INFO - retrained model: LGBMRanker(colsample_bytree=0.9852774042640857, + learning_rate=0.034918421933217675, max_bin=1023, + min_child_samples=22, n_estimators=6, num_leaves=23, + reg_alpha=0.0009765625, reg_lambda=21.505295697527654, verbose=-1) +[flaml.automl: 11-15 07:14:31] {1608} INFO - fit succeeded +[flaml.automl: 11-15 07:14:31] {1610} INFO - Time taken to find the best model: 0.8846545219421387 +[flaml.automl: 11-15 07:14:31] {1624} WARNING - Time taken to find the best model is 88% of the provided time budget and not all estimators' hyperparameter search converged. Consider increasing the time budget. 
+``` \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.4/Examples/AutoML-Regression.md b/website/versioned_docs/version-1.0.4/Examples/AutoML-Regression.md new file mode 100644 index 0000000000..1c2032958d --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Examples/AutoML-Regression.md @@ -0,0 +1,101 @@ +# AutoML - Regression + +### A basic regression example + +```python +from flaml import AutoML +from sklearn.datasets import fetch_california_housing + +# Initialize an AutoML instance +automl = AutoML() +# Specify automl goal and constraint +automl_settings = { + "time_budget": 1, # in seconds + "metric": 'r2', + "task": 'regression', + "log_file_name": "california.log", +} +X_train, y_train = fetch_california_housing(return_X_y=True) +# Train with labeled input data +automl.fit(X_train=X_train, y_train=y_train, + **automl_settings) +# Predict +print(automl.predict(X_train)) +# Print the best model +print(automl.model.estimator) +``` + +#### Sample output + +``` +[flaml.automl: 11-15 07:08:19] {1485} INFO - Data split method: uniform +[flaml.automl: 11-15 07:08:19] {1489} INFO - Evaluation method: holdout +[flaml.automl: 11-15 07:08:19] {1540} INFO - Minimizing error metric: 1-r2 +[flaml.automl: 11-15 07:08:19] {1577} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree'] +[flaml.automl: 11-15 07:08:19] {1826} INFO - iteration 0, current learner lgbm +[flaml.automl: 11-15 07:08:19] {1944} INFO - Estimated sufficient time budget=846s. Estimated necessary time budget=2s. 
+[flaml.automl: 11-15 07:08:19] {2029} INFO - at 0.2s, estimator lgbm's best error=0.7393, best estimator lgbm's best error=0.7393 +[flaml.automl: 11-15 07:08:19] {1826} INFO - iteration 1, current learner lgbm +[flaml.automl: 11-15 07:08:19] {2029} INFO - at 0.3s, estimator lgbm's best error=0.7393, best estimator lgbm's best error=0.7393 +[flaml.automl: 11-15 07:08:19] {1826} INFO - iteration 2, current learner lgbm +[flaml.automl: 11-15 07:08:19] {2029} INFO - at 0.3s, estimator lgbm's best error=0.5446, best estimator lgbm's best error=0.5446 +[flaml.automl: 11-15 07:08:19] {1826} INFO - iteration 3, current learner lgbm +[flaml.automl: 11-15 07:08:19] {2029} INFO - at 0.4s, estimator lgbm's best error=0.2807, best estimator lgbm's best error=0.2807 +[flaml.automl: 11-15 07:08:19] {1826} INFO - iteration 4, current learner lgbm +[flaml.automl: 11-15 07:08:19] {2029} INFO - at 0.5s, estimator lgbm's best error=0.2712, best estimator lgbm's best error=0.2712 +[flaml.automl: 11-15 07:08:19] {1826} INFO - iteration 5, current learner lgbm +[flaml.automl: 11-15 07:08:19] {2029} INFO - at 0.5s, estimator lgbm's best error=0.2712, best estimator lgbm's best error=0.2712 +[flaml.automl: 11-15 07:08:19] {1826} INFO - iteration 6, current learner lgbm +[flaml.automl: 11-15 07:08:20] {2029} INFO - at 0.6s, estimator lgbm's best error=0.2712, best estimator lgbm's best error=0.2712 +[flaml.automl: 11-15 07:08:20] {1826} INFO - iteration 7, current learner lgbm +[flaml.automl: 11-15 07:08:20] {2029} INFO - at 0.7s, estimator lgbm's best error=0.2197, best estimator lgbm's best error=0.2197 +[flaml.automl: 11-15 07:08:20] {1826} INFO - iteration 8, current learner xgboost +[flaml.automl: 11-15 07:08:20] {2029} INFO - at 0.8s, estimator xgboost's best error=1.4958, best estimator lgbm's best error=0.2197 +[flaml.automl: 11-15 07:08:20] {1826} INFO - iteration 9, current learner xgboost +[flaml.automl: 11-15 07:08:20] {2029} INFO - at 0.8s, estimator xgboost's best 
error=1.4958, best estimator lgbm's best error=0.2197 +[flaml.automl: 11-15 07:08:20] {1826} INFO - iteration 10, current learner xgboost +[flaml.automl: 11-15 07:08:20] {2029} INFO - at 0.9s, estimator xgboost's best error=0.7052, best estimator lgbm's best error=0.2197 +[flaml.automl: 11-15 07:08:20] {1826} INFO - iteration 11, current learner xgboost +[flaml.automl: 11-15 07:08:20] {2029} INFO - at 0.9s, estimator xgboost's best error=0.3619, best estimator lgbm's best error=0.2197 +[flaml.automl: 11-15 07:08:20] {1826} INFO - iteration 12, current learner xgboost +[flaml.automl: 11-15 07:08:20] {2029} INFO - at 0.9s, estimator xgboost's best error=0.3619, best estimator lgbm's best error=0.2197 +[flaml.automl: 11-15 07:08:20] {1826} INFO - iteration 13, current learner xgboost +[flaml.automl: 11-15 07:08:20] {2029} INFO - at 1.0s, estimator xgboost's best error=0.3619, best estimator lgbm's best error=0.2197 +[flaml.automl: 11-15 07:08:20] {1826} INFO - iteration 14, current learner extra_tree +[flaml.automl: 11-15 07:08:20] {2029} INFO - at 1.1s, estimator extra_tree's best error=0.7197, best estimator lgbm's best error=0.2197 +[flaml.automl: 11-15 07:08:20] {2242} INFO - retrain lgbm for 0.0s +[flaml.automl: 11-15 07:08:20] {2247} INFO - retrained model: LGBMRegressor(colsample_bytree=0.7610534336273627, + learning_rate=0.41929025492645006, max_bin=255, + min_child_samples=4, n_estimators=45, num_leaves=4, + reg_alpha=0.0009765625, reg_lambda=0.009280655005879943, + verbose=-1) +[flaml.automl: 11-15 07:08:20] {1608} INFO - fit succeeded +[flaml.automl: 11-15 07:08:20] {1610} INFO - Time taken to find the best model: 0.7289648056030273 +[flaml.automl: 11-15 07:08:20] {1624} WARNING - Time taken to find the best model is 73% of the provided time budget and not all estimators' hyperparameter search converged. Consider increasing the time budget. 
+``` + +### Multi-output regression + +We can combine `sklearn.MultiOutputRegressor` and `flaml.AutoML` to do AutoML for multi-output regression. + +```python +from flaml import AutoML +from sklearn.datasets import make_regression +from sklearn.model_selection import train_test_split +from sklearn.multioutput import MultiOutputRegressor + +# create regression data +X, y = make_regression(n_targets=3) + +# split into train and test data +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42) + +# train the model +model = MultiOutputRegressor(AutoML(task="regression", time_budget=60)) +model.fit(X_train, y_train) + +# predict +print(model.predict(X_test)) +``` + +It will perform AutoML for each target, each taking 60 seconds. \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.4/Examples/AutoML-Time series forecast.md b/website/versioned_docs/version-1.0.4/Examples/AutoML-Time series forecast.md new file mode 100644 index 0000000000..526eb3f783 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Examples/AutoML-Time series forecast.md @@ -0,0 +1,1555 @@ +# AutoML - Time Series Forecast + +### Prerequisites + +Install the [ts_forecast] option. 
+```bash +pip install "flaml[ts_forecast]" +``` + +### Simple NumPy Example + +```python +import numpy as np +from flaml import AutoML + +X_train = np.arange('2014-01', '2022-01', dtype='datetime64[M]') +y_train = np.random.random(size=84) +automl = AutoML() +automl.fit(X_train=X_train[:84], # a single column of timestamp + y_train=y_train, # value for each timestamp + period=12, # time horizon to forecast, e.g., 12 months + task='ts_forecast', time_budget=15, # time budget in seconds + log_file_name="ts_forecast.log", + eval_method="holdout", + ) +print(automl.predict(X_train[84:])) +``` + +#### Sample output + +``` +[flaml.automl: 01-21 08:01:20] {2018} INFO - task = ts_forecast +[flaml.automl: 01-21 08:01:20] {2020} INFO - Data split method: time +[flaml.automl: 01-21 08:01:20] {2024} INFO - Evaluation method: holdout +[flaml.automl: 01-21 08:01:20] {2124} INFO - Minimizing error metric: mape +[flaml.automl: 01-21 08:01:21] {2181} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'prophet', 'arima', 'sarimax'] +[flaml.automl: 01-21 08:01:21] {2434} INFO - iteration 0, current learner lgbm +[flaml.automl: 01-21 08:01:21] {2547} INFO - Estimated sufficient time budget=1429s. Estimated necessary time budget=1s. 
+[flaml.automl: 01-21 08:01:21] {2594} INFO - at 0.9s, estimator lgbm's best error=0.9811, best estimator lgbm's best error=0.9811 +[flaml.automl: 01-21 08:01:21] {2434} INFO - iteration 1, current learner lgbm +[flaml.automl: 01-21 08:01:21] {2594} INFO - at 0.9s, estimator lgbm's best error=0.9811, best estimator lgbm's best error=0.9811 +[flaml.automl: 01-21 08:01:21] {2434} INFO - iteration 2, current learner lgbm +[flaml.automl: 01-21 08:01:21] {2594} INFO - at 0.9s, estimator lgbm's best error=0.9811, best estimator lgbm's best error=0.9811 +[flaml.automl: 01-21 08:01:21] {2434} INFO - iteration 3, current learner lgbm +[flaml.automl: 01-21 08:01:21] {2594} INFO - at 1.0s, estimator lgbm's best error=0.9811, best estimator lgbm's best error=0.9811 +[flaml.automl: 01-21 08:01:21] {2434} INFO - iteration 4, current learner lgbm +[flaml.automl: 01-21 08:01:21] {2594} INFO - at 1.0s, estimator lgbm's best error=0.9811, best estimator lgbm's best error=0.9811 +[flaml.automl: 01-21 08:01:21] {2434} INFO - iteration 5, current learner lgbm +[flaml.automl: 01-21 08:01:21] {2594} INFO - at 1.0s, estimator lgbm's best error=0.9811, best estimator lgbm's best error=0.9811 +[flaml.automl: 01-21 08:01:21] {2434} INFO - iteration 6, current learner lgbm +[flaml.automl: 01-21 08:01:21] {2594} INFO - at 1.0s, estimator lgbm's best error=0.9652, best estimator lgbm's best error=0.9652 +[flaml.automl: 01-21 08:01:21] {2434} INFO - iteration 7, current learner lgbm +[flaml.automl: 01-21 08:01:21] {2594} INFO - at 1.0s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:21] {2434} INFO - iteration 8, current learner lgbm +[flaml.automl: 01-21 08:01:21] {2594} INFO - at 1.0s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:21] {2434} INFO - iteration 9, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.1s, estimator lgbm's best error=0.9466, best 
estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 10, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.1s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 11, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.1s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 12, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.1s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 13, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.1s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 14, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.1s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 15, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.2s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 16, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.2s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 17, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.2s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 18, current learner rf +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.2s, estimator rf's best error=1.0994, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 19, 
current learner rf +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.2s, estimator rf's best error=1.0848, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 20, current learner xgboost +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.3s, estimator xgboost's best error=1.0271, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 21, current learner rf +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.3s, estimator rf's best error=1.0848, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 22, current learner xgboost +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.3s, estimator xgboost's best error=1.0015, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 23, current learner xgboost +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.3s, estimator xgboost's best error=1.0015, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 24, current learner xgboost +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.3s, estimator xgboost's best error=1.0015, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 25, current learner extra_tree +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.3s, estimator extra_tree's best error=1.0130, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 26, current learner extra_tree +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.4s, estimator extra_tree's best error=1.0130, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 27, current learner extra_tree +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.4s, estimator extra_tree's best error=1.0130, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 28, current learner extra_tree 
+[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.4s, estimator extra_tree's best error=1.0130, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 29, current learner extra_tree +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.4s, estimator extra_tree's best error=0.9499, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 30, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.5s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 31, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.5s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 32, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.5s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 33, current learner extra_tree +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.5s, estimator extra_tree's best error=0.9499, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 34, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.5s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 35, current learner xgboost +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.5s, estimator xgboost's best error=1.0015, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 36, current learner extra_tree +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.6s, estimator extra_tree's best error=0.9499, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 37, current learner extra_tree +[flaml.automl: 01-21 08:01:22] 
{2594} INFO - at 1.6s, estimator extra_tree's best error=0.9499, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 38, current learner extra_tree +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.6s, estimator extra_tree's best error=0.9499, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 39, current learner xgboost +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.6s, estimator xgboost's best error=1.0015, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 40, current learner extra_tree +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.6s, estimator extra_tree's best error=0.9499, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 41, current learner extra_tree +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.7s, estimator extra_tree's best error=0.9499, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 42, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.7s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 43, current learner extra_tree +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.7s, estimator extra_tree's best error=0.9499, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 44, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.7s, estimator xgb_limitdepth's best error=1.5815, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 45, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.8s, estimator xgb_limitdepth's best error=0.9683, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 46, current learner xgb_limitdepth 
+[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.8s, estimator xgb_limitdepth's best error=0.9683, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 47, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.8s, estimator xgb_limitdepth's best error=0.9683, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 48, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.9s, estimator xgb_limitdepth's best error=0.9683, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 49, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.9s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 50, current learner extra_tree +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.9s, estimator extra_tree's best error=0.9499, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 51, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 1.9s, estimator xgb_limitdepth's best error=0.9683, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 52, current learner xgboost +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 2.0s, estimator xgboost's best error=1.0015, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 53, current learner xgboost +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 2.0s, estimator xgboost's best error=1.0015, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 54, current learner lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 2.0s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 55, current learner 
lgbm +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 2.0s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 56, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 2.0s, estimator xgb_limitdepth's best error=0.9683, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 57, current learner rf +[flaml.automl: 01-21 08:01:22] {2594} INFO - at 2.0s, estimator rf's best error=1.0848, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:22] {2434} INFO - iteration 58, current learner xgboost +[flaml.automl: 01-21 08:01:23] {2594} INFO - at 2.1s, estimator xgboost's best error=1.0015, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:23] {2434} INFO - iteration 59, current learner extra_tree +[flaml.automl: 01-21 08:01:23] {2594} INFO - at 2.1s, estimator extra_tree's best error=0.9499, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:23] {2434} INFO - iteration 60, current learner lgbm +[flaml.automl: 01-21 08:01:23] {2594} INFO - at 2.1s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:23] {2434} INFO - iteration 61, current learner extra_tree +[flaml.automl: 01-21 08:01:23] {2594} INFO - at 2.1s, estimator extra_tree's best error=0.9499, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:23] {2434} INFO - iteration 62, current learner lgbm +[flaml.automl: 01-21 08:01:23] {2594} INFO - at 2.1s, estimator lgbm's best error=0.9466, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:23] {2434} INFO - iteration 63, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:23] {2594} INFO - at 2.2s, estimator xgb_limitdepth's best error=0.9683, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:23] {2434} INFO - iteration 64, current learner prophet +[flaml.automl: 
01-21 08:01:25] {2594} INFO - at 4.2s, estimator prophet's best error=1.5706, best estimator lgbm's best error=0.9466 +[flaml.automl: 01-21 08:01:25] {2434} INFO - iteration 65, current learner arima +[flaml.automl: 01-21 08:01:25] {2594} INFO - at 4.2s, estimator arima's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:25] {2434} INFO - iteration 66, current learner arima +[flaml.automl: 01-21 08:01:25] {2594} INFO - at 4.4s, estimator arima's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:25] {2434} INFO - iteration 67, current learner sarimax +[flaml.automl: 01-21 08:01:25] {2594} INFO - at 4.4s, estimator sarimax's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:25] {2434} INFO - iteration 68, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:25] {2594} INFO - at 4.5s, estimator xgb_limitdepth's best error=0.9683, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:25] {2434} INFO - iteration 69, current learner sarimax +[flaml.automl: 01-21 08:01:25] {2594} INFO - at 4.6s, estimator sarimax's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:25] {2434} INFO - iteration 70, current learner sarimax +[flaml.automl: 01-21 08:01:25] {2594} INFO - at 4.6s, estimator sarimax's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:25] {2434} INFO - iteration 71, current learner arima +[flaml.automl: 01-21 08:01:25] {2594} INFO - at 4.6s, estimator arima's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:25] {2434} INFO - iteration 72, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:25] {2594} INFO - at 4.6s, estimator xgb_limitdepth's best error=0.9683, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:25] {2434} INFO - iteration 73, current learner arima +[flaml.automl: 01-21 08:01:25] 
{2594} INFO - at 4.7s, estimator arima's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:25] {2434} INFO - iteration 74, current learner sarimax +[flaml.automl: 01-21 08:01:25] {2594} INFO - at 4.7s, estimator sarimax's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:25] {2434} INFO - iteration 75, current learner arima +[flaml.automl: 01-21 08:01:25] {2594} INFO - at 4.8s, estimator arima's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:25] {2434} INFO - iteration 76, current learner sarimax +[flaml.automl: 01-21 08:01:25] {2594} INFO - at 4.9s, estimator sarimax's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:25] {2434} INFO - iteration 77, current learner arima +[flaml.automl: 01-21 08:01:25] {2594} INFO - at 5.0s, estimator arima's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:25] {2434} INFO - iteration 78, current learner sarimax +[flaml.automl: 01-21 08:01:26] {2594} INFO - at 5.1s, estimator sarimax's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:26] {2434} INFO - iteration 79, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:26] {2594} INFO - at 5.1s, estimator xgb_limitdepth's best error=0.9683, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:26] {2434} INFO - iteration 80, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:26] {2594} INFO - at 5.1s, estimator xgb_limitdepth's best error=0.9683, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:26] {2434} INFO - iteration 81, current learner sarimax +[flaml.automl: 01-21 08:01:26] {2594} INFO - at 5.1s, estimator sarimax's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:26] {2434} INFO - iteration 82, current learner prophet +[flaml.automl: 01-21 08:01:27] {2594} 
INFO - at 6.6s, estimator prophet's best error=1.4076, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:27] {2434} INFO - iteration 83, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:27] {2594} INFO - at 6.6s, estimator xgb_limitdepth's best error=0.9683, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:27] {2434} INFO - iteration 84, current learner sarimax +[flaml.automl: 01-21 08:01:27] {2594} INFO - at 6.6s, estimator sarimax's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:27] {2434} INFO - iteration 85, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:27] {2594} INFO - at 6.6s, estimator xgb_limitdepth's best error=0.9683, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:27] {2434} INFO - iteration 86, current learner sarimax +[flaml.automl: 01-21 08:01:27] {2594} INFO - at 6.8s, estimator sarimax's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:27] {2434} INFO - iteration 87, current learner arima +[flaml.automl: 01-21 08:01:27] {2594} INFO - at 6.8s, estimator arima's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:27] {2434} INFO - iteration 88, current learner sarimax +[flaml.automl: 01-21 08:01:27] {2594} INFO - at 6.9s, estimator sarimax's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:27] {2434} INFO - iteration 89, current learner arima +[flaml.automl: 01-21 08:01:27] {2594} INFO - at 6.9s, estimator arima's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:27] {2434} INFO - iteration 90, current learner arima +[flaml.automl: 01-21 08:01:27] {2594} INFO - at 7.0s, estimator arima's best error=0.5693, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:27] {2434} INFO - iteration 91, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:27] {2594} INFO 
- at 7.0s, estimator xgb_limitdepth's best error=0.9683, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:27] {2434} INFO - iteration 92, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:27] {2594} INFO - at 7.0s, estimator xgb_limitdepth's best error=0.9683, best estimator arima's best error=0.5693 +[flaml.automl: 01-21 08:01:27] {2434} INFO - iteration 93, current learner sarimax +[flaml.automl: 01-21 08:01:28] {2594} INFO - at 7.0s, estimator sarimax's best error=0.5600, best estimator sarimax's best error=0.5600 +[flaml.automl: 01-21 08:01:28] {2434} INFO - iteration 94, current learner xgb_limitdepth +[flaml.automl: 01-21 08:01:28] {2594} INFO - at 7.1s, estimator xgb_limitdepth's best error=0.9683, best estimator sarimax's best error=0.5600 +[flaml.automl: 01-21 08:01:28] {2434} INFO - iteration 95, current learner sarimax +[flaml.automl: 01-21 08:01:28] {2594} INFO - at 7.2s, estimator sarimax's best error=0.5600, best estimator sarimax's best error=0.5600 +[flaml.automl: 01-21 08:01:28] {2434} INFO - iteration 96, current learner arima +[flaml.automl: 01-21 08:01:28] {2594} INFO - at 7.2s, estimator arima's best error=0.5693, best estimator sarimax's best error=0.5600 +[flaml.automl: 01-21 08:01:28] {2434} INFO - iteration 97, current learner arima +[flaml.automl: 01-21 08:01:28] {2594} INFO - at 7.2s, estimator arima's best error=0.5693, best estimator sarimax's best error=0.5600 +[flaml.automl: 01-21 08:01:28] {2434} INFO - iteration 98, current learner extra_tree +[flaml.automl: 01-21 08:01:28] {2594} INFO - at 7.3s, estimator extra_tree's best error=0.9499, best estimator sarimax's best error=0.5600 +[flaml.automl: 01-21 08:01:28] {2434} INFO - iteration 99, current learner sarimax +[flaml.automl: 01-21 08:01:28] {2594} INFO - at 7.3s, estimator sarimax's best error=0.5600, best estimator sarimax's best error=0.5600 +[flaml.automl: 01-21 08:01:28] {2434} INFO - iteration 100, current learner xgb_limitdepth +[flaml.automl: 
01-21 08:01:28] {2594} INFO - at 7.3s, estimator xgb_limitdepth's best error=0.9683, best estimator sarimax's best error=0.5600 +``` + +### Univariate time series + +```python +import statsmodels.api as sm + +data = sm.datasets.co2.load_pandas().data +# data is given in weeks, but the task is to predict monthly, so use monthly averages instead +data = data['co2'].resample('MS').mean() +data = data.bfill().ffill() # makes sure there are no missing values +data = data.to_frame().reset_index() +num_samples = data.shape[0] +time_horizon = 12 +split_idx = num_samples - time_horizon +train_df = data[:split_idx] # train_df is a dataframe with two columns: timestamp and label +X_test = data[split_idx:]['index'].to_frame() # X_test is a dataframe with dates for prediction +y_test = data[split_idx:]['co2'] # y_test is a series of the values corresponding to the dates for prediction + +from flaml import AutoML + +automl = AutoML() +settings = { + "time_budget": 10, # total running time in seconds + "metric": 'mape', # primary metric for validation: 'mape' is generally used for forecast tasks + "task": 'ts_forecast', # task type + "log_file_name": 'CO2_forecast.log', # flaml log file + "eval_method": "holdout", # validation method can be chosen from ['auto', 'holdout', 'cv'] + "seed": 7654321, # random seed +} + +automl.fit(dataframe=train_df, # training data + label='co2', # label column + period=time_horizon, # keyword argument 'period' must be included for forecast task + **settings) +``` + +#### Sample output + +``` +[flaml.automl: 01-21 07:54:04] {2018} INFO - task = ts_forecast +[flaml.automl: 01-21 07:54:04] {2020} INFO - Data split method: time +[flaml.automl: 01-21 07:54:04] {2024} INFO - Evaluation method: holdout +[flaml.automl: 01-21 07:54:04] {2124} INFO - Minimizing error metric: mape +Importing plotly failed. Interactive plots will not work. 
+[flaml.automl: 01-21 07:54:04] {2181} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'prophet', 'arima', 'sarimax'] +[flaml.automl: 01-21 07:54:04] {2434} INFO - iteration 0, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2547} INFO - Estimated sufficient time budget=2145s. Estimated necessary time budget=2s. +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 0.9s, estimator lgbm's best error=0.0621, best estimator lgbm's best error=0.0621 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 1, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.0s, estimator lgbm's best error=0.0574, best estimator lgbm's best error=0.0574 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 2, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.0s, estimator lgbm's best error=0.0464, best estimator lgbm's best error=0.0464 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 3, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.0s, estimator lgbm's best error=0.0464, best estimator lgbm's best error=0.0464 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 4, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.0s, estimator lgbm's best error=0.0365, best estimator lgbm's best error=0.0365 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 5, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.1s, estimator lgbm's best error=0.0192, best estimator lgbm's best error=0.0192 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 6, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.1s, estimator lgbm's best error=0.0192, best estimator lgbm's best error=0.0192 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 7, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.1s, estimator lgbm's best error=0.0192, best estimator lgbm's best error=0.0192 +[flaml.automl: 
01-21 07:54:05] {2434} INFO - iteration 8, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.2s, estimator lgbm's best error=0.0110, best estimator lgbm's best error=0.0110 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 9, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.2s, estimator lgbm's best error=0.0110, best estimator lgbm's best error=0.0110 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 10, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.2s, estimator lgbm's best error=0.0036, best estimator lgbm's best error=0.0036 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 11, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.4s, estimator lgbm's best error=0.0023, best estimator lgbm's best error=0.0023 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 12, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.4s, estimator lgbm's best error=0.0023, best estimator lgbm's best error=0.0023 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 13, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.5s, estimator lgbm's best error=0.0021, best estimator lgbm's best error=0.0021 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 14, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.6s, estimator lgbm's best error=0.0021, best estimator lgbm's best error=0.0021 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 15, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.7s, estimator lgbm's best error=0.0020, best estimator lgbm's best error=0.0020 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 16, current learner lgbm +[flaml.automl: 01-21 07:54:05] {2594} INFO - at 1.8s, estimator lgbm's best error=0.0017, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:05] {2434} INFO - iteration 17, current learner lgbm +[flaml.automl: 01-21 
07:54:06] {2594} INFO - at 1.9s, estimator lgbm's best error=0.0017, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 18, current learner lgbm +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.0s, estimator lgbm's best error=0.0017, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 19, current learner lgbm +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.1s, estimator lgbm's best error=0.0017, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 20, current learner rf +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.1s, estimator rf's best error=0.0228, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 21, current learner rf +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.1s, estimator rf's best error=0.0210, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 22, current learner xgboost +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.2s, estimator xgboost's best error=0.6738, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 23, current learner xgboost +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.2s, estimator xgboost's best error=0.6738, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 24, current learner xgboost +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.2s, estimator xgboost's best error=0.1717, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 25, current learner xgboost +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.3s, estimator xgboost's best error=0.0249, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 26, current learner xgboost +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.3s, estimator xgboost's best error=0.0249, 
best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 27, current learner xgboost +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.3s, estimator xgboost's best error=0.0242, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 28, current learner extra_tree +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.4s, estimator extra_tree's best error=0.0245, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 29, current learner extra_tree +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.4s, estimator extra_tree's best error=0.0160, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 30, current learner lgbm +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.5s, estimator lgbm's best error=0.0017, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 31, current learner lgbm +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.6s, estimator lgbm's best error=0.0017, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 32, current learner rf +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.6s, estimator rf's best error=0.0210, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 33, current learner extra_tree +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.6s, estimator extra_tree's best error=0.0160, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 34, current learner lgbm +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.8s, estimator lgbm's best error=0.0017, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 35, current learner extra_tree +[flaml.automl: 01-21 07:54:06] {2594} INFO - at 2.8s, estimator extra_tree's best error=0.0158, best estimator lgbm's best error=0.0017 
+[flaml.automl: 01-21 07:54:06] {2434} INFO - iteration 36, current learner xgb_limitdepth +[flaml.automl: 01-21 07:54:07] {2594} INFO - at 2.8s, estimator xgb_limitdepth's best error=0.0447, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:07] {2434} INFO - iteration 37, current learner xgb_limitdepth +[flaml.automl: 01-21 07:54:07] {2594} INFO - at 2.9s, estimator xgb_limitdepth's best error=0.0447, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:07] {2434} INFO - iteration 38, current learner xgb_limitdepth +[flaml.automl: 01-21 07:54:07] {2594} INFO - at 2.9s, estimator xgb_limitdepth's best error=0.0029, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:07] {2434} INFO - iteration 39, current learner xgb_limitdepth +[flaml.automl: 01-21 07:54:07] {2594} INFO - at 3.0s, estimator xgb_limitdepth's best error=0.0018, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:07] {2434} INFO - iteration 40, current learner xgb_limitdepth +[flaml.automl: 01-21 07:54:07] {2594} INFO - at 3.1s, estimator xgb_limitdepth's best error=0.0018, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:07] {2434} INFO - iteration 41, current learner xgb_limitdepth +[flaml.automl: 01-21 07:54:07] {2594} INFO - at 3.1s, estimator xgb_limitdepth's best error=0.0018, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:07] {2434} INFO - iteration 42, current learner xgb_limitdepth +[flaml.automl: 01-21 07:54:07] {2594} INFO - at 3.3s, estimator xgb_limitdepth's best error=0.0018, best estimator lgbm's best error=0.0017 +[flaml.automl: 01-21 07:54:07] {2434} INFO - iteration 43, current learner prophet +[flaml.automl: 01-21 07:54:09] {2594} INFO - at 5.5s, estimator prophet's best error=0.0008, best estimator prophet's best error=0.0008 +[flaml.automl: 01-21 07:54:09] {2434} INFO - iteration 44, current learner arima +[flaml.automl: 01-21 07:54:10] {2594} INFO - at 6.1s, estimator arima's 
best error=0.0047, best estimator prophet's best error=0.0008 +[flaml.automl: 01-21 07:54:10] {2434} INFO - iteration 45, current learner sarimax +[flaml.automl: 01-21 07:54:10] {2594} INFO - at 6.4s, estimator sarimax's best error=0.0047, best estimator prophet's best error=0.0008 +[flaml.automl: 01-21 07:54:10] {2434} INFO - iteration 46, current learner lgbm +[flaml.automl: 01-21 07:54:10] {2594} INFO - at 6.5s, estimator lgbm's best error=0.0017, best estimator prophet's best error=0.0008 +[flaml.automl: 01-21 07:54:10] {2434} INFO - iteration 47, current learner sarimax +[flaml.automl: 01-21 07:54:10] {2594} INFO - at 6.6s, estimator sarimax's best error=0.0047, best estimator prophet's best error=0.0008 +[flaml.automl: 01-21 07:54:10] {2434} INFO - iteration 48, current learner sarimax +[flaml.automl: 01-21 07:54:11] {2594} INFO - at 6.9s, estimator sarimax's best error=0.0047, best estimator prophet's best error=0.0008 +[flaml.automl: 01-21 07:54:11] {2434} INFO - iteration 49, current learner arima +[flaml.automl: 01-21 07:54:11] {2594} INFO - at 6.9s, estimator arima's best error=0.0047, best estimator prophet's best error=0.0008 +[flaml.automl: 01-21 07:54:11] {2434} INFO - iteration 50, current learner xgb_limitdepth +[flaml.automl: 01-21 07:54:11] {2594} INFO - at 7.0s, estimator xgb_limitdepth's best error=0.0018, best estimator prophet's best error=0.0008 +[flaml.automl: 01-21 07:54:11] {2434} INFO - iteration 51, current learner sarimax +[flaml.automl: 01-21 07:54:11] {2594} INFO - at 7.5s, estimator sarimax's best error=0.0047, best estimator prophet's best error=0.0008 +[flaml.automl: 01-21 07:54:11] {2434} INFO - iteration 52, current learner xgboost +[flaml.automl: 01-21 07:54:11] {2594} INFO - at 7.6s, estimator xgboost's best error=0.0242, best estimator prophet's best error=0.0008 +[flaml.automl: 01-21 07:54:11] {2434} INFO - iteration 53, current learner prophet +[flaml.automl: 01-21 07:54:13] {2594} INFO - at 9.3s, estimator prophet's best 
error=0.0005, best estimator prophet's best error=0.0005 +[flaml.automl: 01-21 07:54:13] {2434} INFO - iteration 54, current learner sarimax +[flaml.automl: 01-21 07:54:13] {2594} INFO - at 9.4s, estimator sarimax's best error=0.0047, best estimator prophet's best error=0.0005 +[flaml.automl: 01-21 07:54:13] {2434} INFO - iteration 55, current learner xgb_limitdepth +[flaml.automl: 01-21 07:54:13] {2594} INFO - at 9.8s, estimator xgb_limitdepth's best error=0.0018, best estimator prophet's best error=0.0005 +[flaml.automl: 01-21 07:54:13] {2434} INFO - iteration 56, current learner xgboost +[flaml.automl: 01-21 07:54:13] {2594} INFO - at 9.8s, estimator xgboost's best error=0.0242, best estimator prophet's best error=0.0005 +[flaml.automl: 01-21 07:54:13] {2434} INFO - iteration 57, current learner lgbm +[flaml.automl: 01-21 07:54:14] {2594} INFO - at 9.9s, estimator lgbm's best error=0.0017, best estimator prophet's best error=0.0005 +[flaml.automl: 01-21 07:54:14] {2434} INFO - iteration 58, current learner rf +[flaml.automl: 01-21 07:54:14] {2594} INFO - at 10.0s, estimator rf's best error=0.0146, best estimator prophet's best error=0.0005 +[flaml.automl: 01-21 07:54:14] {2824} INFO - retrain prophet for 0.6s +[flaml.automl: 01-21 07:54:14] {2831} INFO - retrained model: +[flaml.automl: 01-21 07:54:14] {2210} INFO - fit succeeded +[flaml.automl: 01-21 07:54:14] {2211} INFO - Time taken to find the best model: 9.339771270751953 +[flaml.automl: 01-21 07:54:14] {2222} WARNING - Time taken to find the best model is 93% of the provided time budget and not all estimators' hyperparameter search converged. Consider increasing the time budget. +``` + +#### Compute and plot predictions + +The example plotting code requires matplotlib. 
+ +```python +flaml_y_pred = automl.predict(X_test) +import matplotlib.pyplot as plt + +plt.plot(X_test, y_test, label='Actual level') +plt.plot(X_test, flaml_y_pred, label='FLAML forecast') +plt.xlabel('Date') +plt.ylabel('CO2 Levels') +plt.legend() +``` + +![png](images/CO2.png) + +### Multivariate Time Series (Forecasting with Exogenous Variables) +```python +import pandas as pd + +# pd.set_option("display.max_rows", None, "display.max_columns", None) +multi_df = pd.read_csv( + "https://raw.githubusercontent.com/srivatsan88/YouTubeLI/master/dataset/nyc_energy_consumption.csv" +) + +# preprocessing data +multi_df["timeStamp"] = pd.to_datetime(multi_df["timeStamp"]) +multi_df = multi_df.set_index("timeStamp") +multi_df = multi_df.resample("D").mean() +multi_df["temp"] = multi_df["temp"].fillna(method="ffill") +multi_df["precip"] = multi_df["precip"].fillna(method="ffill") +multi_df = multi_df[:-2] # last two rows are NaN for 'demand' column so remove them +multi_df = multi_df.reset_index() + +# Using temperature values create categorical values +# where 1 denotes daily temperature is above monthly average and 0 is below. 
+def get_monthly_avg(data): + data["month"] = data["timeStamp"].dt.month + data = data[["month", "temp"]].groupby("month") + data = data.agg({"temp": "mean"}) + return data + +monthly_avg = get_monthly_avg(multi_df).to_dict().get("temp") + +def above_monthly_avg(date, temp): + month = date.month + if temp > monthly_avg.get(month): + return 1 + else: + return 0 + +multi_df["temp_above_monthly_avg"] = multi_df.apply( + lambda x: above_monthly_avg(x["timeStamp"], x["temp"]), axis=1 +) + +del multi_df["month"] # remove the intermediate month column to reduce redundancy + +# split data into train and test +num_samples = multi_df.shape[0] +multi_time_horizon = 180 +split_idx = num_samples - multi_time_horizon +multi_train_df = multi_df[:split_idx] +multi_test_df = multi_df[split_idx:] + +multi_X_test = multi_test_df[ + ["timeStamp", "precip", "temp", "temp_above_monthly_avg"] +] # test dataframe must contain values for the regressors / multivariate variables +multi_y_test = multi_test_df["demand"] + +# initialize AutoML instance +automl = AutoML() + +# configure AutoML settings +settings = { + "time_budget": 10, # total running time in seconds + "metric": "mape", # primary metric + "task": "ts_forecast", # task type + "log_file_name": "energy_forecast_categorical.log", # flaml log file + "eval_method": "holdout", + "log_type": "all", + "label": "demand", +} + +# train the model +automl.fit(dataframe=multi_train_df, **settings, period=multi_time_horizon) + +# predictions +print(automl.predict(multi_X_test)) +``` + +#### Sample Output + +``` +[flaml.automl: 08-13 01:03:11] {2540} INFO - task = ts_forecast +[flaml.automl: 08-13 01:03:11] {2542} INFO - Data split method: time +[flaml.automl: 08-13 01:03:11] {2545} INFO - Evaluation method: holdout +[flaml.automl: 08-13 01:03:11] {2664} INFO - Minimizing error metric: mape +[flaml.automl: 08-13 01:03:12] {2806} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'prophet', 'arima', 'sarimax'] 
+[flaml.automl: 08-13 01:03:12] {3108} INFO - iteration 0, current learner lgbm +[flaml.automl: 08-13 01:03:12] {3241} INFO - Estimated sufficient time budget=7681s. Estimated necessary time budget=8s. +[flaml.automl: 08-13 01:03:12] {3288} INFO - at 0.8s, estimator lgbm's best error=0.0854, best estimator lgbm's best error=0.0854 +[flaml.automl: 08-13 01:03:12] {3108} INFO - iteration 1, current learner lgbm +[flaml.automl: 08-13 01:03:12] {3288} INFO - at 0.9s, estimator lgbm's best error=0.0854, best estimator lgbm's best error=0.0854 +[flaml.automl: 08-13 01:03:12] {3108} INFO - iteration 2, current learner lgbm +[flaml.automl: 08-13 01:03:12] {3288} INFO - at 0.9s, estimator lgbm's best error=0.0525, best estimator lgbm's best error=0.0525 +[flaml.automl: 08-13 01:03:12] {3108} INFO - iteration 3, current learner lgbm +[flaml.automl: 08-13 01:03:12] {3288} INFO - at 0.9s, estimator lgbm's best error=0.0525, best estimator lgbm's best error=0.0525 +[flaml.automl: 08-13 01:03:12] {3108} INFO - iteration 4, current learner lgbm +[flaml.automl: 08-13 01:03:12] {3288} INFO - at 1.0s, estimator lgbm's best error=0.0406, best estimator lgbm's best error=0.0406 +[flaml.automl: 08-13 01:03:12] {3108} INFO - iteration 5, current learner lgbm +[flaml.automl: 08-13 01:03:12] {3288} INFO - at 1.0s, estimator lgbm's best error=0.0406, best estimator lgbm's best error=0.0406 +[flaml.automl: 08-13 01:03:12] {3108} INFO - iteration 6, current learner lgbm +[flaml.automl: 08-13 01:03:12] {3288} INFO - at 1.0s, estimator lgbm's best error=0.0406, best estimator lgbm's best error=0.0406 +[flaml.automl: 08-13 01:03:12] {3108} INFO - iteration 7, current learner lgbm +[flaml.automl: 08-13 01:03:13] {3288} INFO - at 1.1s, estimator lgbm's best error=0.0393, best estimator lgbm's best error=0.0393 +[flaml.automl: 08-13 01:03:13] {3108} INFO - iteration 8, current learner lgbm +[flaml.automl: 08-13 01:03:13] {3288} INFO - at 1.1s, estimator lgbm's best error=0.0393, best estimator 
lgbm's best error=0.0393 +[flaml.automl: 08-13 01:03:13] {3108} INFO - iteration 9, current learner lgbm +... + silent=True, subsample=1.0, subsample_for_bin=200000, + subsample_freq=0, verbose=-1) +[flaml.automl: 08-13 01:03:22] {2837} INFO - fit succeeded +[flaml.automl: 08-13 01:03:22] {2838} INFO - Time taken to find the best model: 3.4941744804382324 +``` + +### Forecasting Discrete Variables +```python +from hcrystalball.utils import get_sales_data +import numpy as np +from flaml import AutoML + +time_horizon = 30 +df = get_sales_data(n_dates=180, n_assortments=1, n_states=1, n_stores=1) +df = df[["Sales", "Open", "Promo", "Promo2"]] + +# feature engineering - create a discrete value column +# 1 denotes above mean and 0 denotes below mean +df["above_mean_sales"] = np.where(df["Sales"] > df["Sales"].mean(), 1, 0) +df.reset_index(inplace=True) + +# train-test split +discrete_train_df = df[:-time_horizon] +discrete_test_df = df[-time_horizon:] +discrete_X_train, discrete_X_test = ( + discrete_train_df[["Date", "Open", "Promo", "Promo2"]], + discrete_test_df[["Date", "Open", "Promo", "Promo2"]], +) +discrete_y_train, discrete_y_test = discrete_train_df["above_mean_sales"], discrete_test_df["above_mean_sales"] + +# initialize AutoML instance +automl = AutoML() + +# configure the settings +settings = { + "time_budget": 15, # total running time in seconds + "metric": "accuracy", # primary metric + "task": "ts_forecast_classification", # task type + "log_file_name": "sales_classification_forecast.log", # flaml log file + "eval_method": "holdout", +} + +# train the model +automl.fit(X_train=discrete_X_train, + y_train=discrete_y_train, + **settings, + period=time_horizon) + +# make predictions +discrete_y_pred = automl.predict(discrete_X_test) +print("Predicted label", discrete_y_pred) +print("True label", discrete_y_test) +``` + +#### Sample Output + +``` +[flaml.automl: 02-28 21:53:03] {2060} INFO - task = ts_forecast_classification +[flaml.automl: 02-28 21:53:03] 
{2062} INFO - Data split method: time +[flaml.automl: 02-28 21:53:03] {2066} INFO - Evaluation method: holdout +[flaml.automl: 02-28 21:53:03] {2147} INFO - Minimizing error metric: 1-accuracy +[flaml.automl: 02-28 21:53:03] {2205} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth'] +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 0, current learner lgbm +[flaml.automl: 02-28 21:53:03] {2573} INFO - Estimated sufficient time budget=269s. Estimated necessary time budget=0s. +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.1s, estimator lgbm's best error=0.2667, best estimator lgbm's best error=0.2667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 1, current learner lgbm +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.1s, estimator lgbm's best error=0.2667, best estimator lgbm's best error=0.2667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 2, current learner lgbm +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.1s, estimator lgbm's best error=0.1333, best estimator lgbm's best error=0.1333 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 3, current learner rf +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.2s, estimator rf's best error=0.1333, best estimator lgbm's best error=0.1333 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 4, current learner xgboost +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.2s, estimator xgboost's best error=0.1333, best estimator lgbm's best error=0.1333 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 5, current learner lgbm +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.2s, estimator lgbm's best error=0.1333, best estimator lgbm's best error=0.1333 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 6, current learner rf +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.3s, estimator rf's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 7, current 
learner lgbm +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.3s, estimator lgbm's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 8, current learner lgbm +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.3s, estimator lgbm's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 9, current learner lgbm +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.4s, estimator lgbm's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 10, current learner rf +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.4s, estimator rf's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 11, current learner rf +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.4s, estimator rf's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 12, current learner xgboost +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.5s, estimator xgboost's best error=0.1333, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 13, current learner extra_tree +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.5s, estimator extra_tree's best error=0.1333, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 14, current learner xgb_limitdepth +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.5s, estimator xgb_limitdepth's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 15, current learner xgboost +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.6s, estimator xgboost's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 16, current learner xgb_limitdepth +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 
0.6s, estimator xgb_limitdepth's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 17, current learner rf +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.6s, estimator rf's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 18, current learner xgb_limitdepth +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.7s, estimator xgb_limitdepth's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 19, current learner lgbm +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.7s, estimator lgbm's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 20, current learner extra_tree +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.7s, estimator extra_tree's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 21, current learner xgboost +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.7s, estimator xgboost's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 22, current learner extra_tree +[flaml.automl: 02-28 21:53:03] {2620} INFO - at 0.8s, estimator extra_tree's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:03] {2458} INFO - iteration 23, current learner rf +[flaml.automl: 02-28 21:53:04] {2620} INFO - at 0.8s, estimator rf's best error=0.0667, best estimator rf's best error=0.0667 +[flaml.automl: 02-28 21:53:04] {2458} INFO - iteration 24, current learner xgboost +[flaml.automl: 02-28 21:53:04] {2620} INFO - at 0.9s, estimator xgboost's best error=0.0333, best estimator xgboost's best error=0.0333 +[flaml.automl: 02-28 21:53:04] {2458} INFO - iteration 25, current learner xgb_limitdepth +[flaml.automl: 02-28 21:53:04] {2620} INFO - at 0.9s, estimator xgb_limitdepth's 
best error=0.0667, best estimator xgboost's best error=0.0333 +[flaml.automl: 02-28 21:53:04] {2458} INFO - iteration 26, current learner xgb_limitdepth +[flaml.automl: 02-28 21:53:04] {2620} INFO - at 0.9s, estimator xgb_limitdepth's best error=0.0667, best estimator xgboost's best error=0.0333 +[flaml.automl: 02-28 21:53:04] {2458} INFO - iteration 27, current learner xgboost +[flaml.automl: 02-28 21:53:04] {2620} INFO - at 0.9s, estimator xgboost's best error=0.0333, best estimator xgboost's best error=0.0333 +[flaml.automl: 02-28 21:53:04] {2458} INFO - iteration 28, current learner extra_tree +[flaml.automl: 02-28 21:53:04] {2620} INFO - at 1.0s, estimator extra_tree's best error=0.0667, best estimator xgboost's best error=0.0333 +[flaml.automl: 02-28 21:53:04] {2458} INFO - iteration 29, current learner xgb_limitdepth +[flaml.automl: 02-28 21:53:04] {2620} INFO - at 1.0s, estimator xgb_limitdepth's best error=0.0667, best estimator xgboost's best error=0.0333 +[flaml.automl: 02-28 21:53:04] {2850} INFO - retrain xgboost for 0.0s +[flaml.automl: 02-28 21:53:04] {2857} INFO - retrained model: XGBClassifier(base_score=0.5, booster='gbtree', + colsample_bylevel=0.9826753651836615, colsample_bynode=1, + colsample_bytree=0.9725493834064914, gamma=0, gpu_id=-1, + grow_policy='lossguide', importance_type='gain', + interaction_constraints='', learning_rate=0.1665803484560213, + max_delta_step=0, max_depth=0, max_leaves=4, + min_child_weight=0.5649012460525115, missing=nan, + monotone_constraints='()', n_estimators=4, n_jobs=-1, + num_parallel_tree=1, objective='binary:logistic', random_state=0, + reg_alpha=0.009638363373006869, reg_lambda=0.143703802530408, + scale_pos_weight=1, subsample=0.9643606787051899, + tree_method='hist', use_label_encoder=False, + validate_parameters=1, verbosity=0) +[flaml.automl: 02-28 21:53:04] {2234} INFO - fit succeeded +[flaml.automl: 02-28 21:53:04] {2235} INFO - Time taken to find the best model: 0.8547139167785645 +``` + +### 
Forecasting with Panel Datasets + +Panel time series datasets involve multiple individual time series. For example, see the Stallion demand dataset from PyTorch Forecasting, originally from Kaggle. + +```python +def get_stalliion_data(): + from pytorch_forecasting.data.examples import get_stallion_data + + data = get_stallion_data() + # add time index - For datasets with no missing values, FLAML will automate this process + data["time_idx"] = data["date"].dt.year * 12 + data["date"].dt.month + data["time_idx"] -= data["time_idx"].min() + # add additional features + data["month"] = data.date.dt.month.astype(str).astype( + "category" + ) # categories have to be strings + data["log_volume"] = np.log(data.volume + 1e-8) + data["avg_volume_by_sku"] = data.groupby( + ["time_idx", "sku"], observed=True + ).volume.transform("mean") + data["avg_volume_by_agency"] = data.groupby( + ["time_idx", "agency"], observed=True + ).volume.transform("mean") + # we want to encode special days as one variable and thus need to first reverse one-hot encoding + special_days = [ + "easter_day", + "good_friday", + "new_year", + "christmas", + "labor_day", + "independence_day", + "revolution_day_memorial", + "regional_games", + "beer_capital", + "music_fest", + ] + data[special_days] = ( + data[special_days] + .apply(lambda x: x.map({0: "-", 1: x.name})) + .astype("category") + ) + return data, special_days + +data, special_days = get_stalliion_data() +time_horizon = 6 # predict six months +training_cutoff = data["time_idx"].max() - time_horizon +data["time_idx"] = data["time_idx"].astype("int") +ts_col = data.pop("date") +data.insert(0, "date", ts_col) +# FLAML assumes input is not sorted, but we sort here for comparison purposes with y_test +data = data.sort_values(["agency", "sku", "date"]) +X_train = data[lambda x: x.time_idx <= training_cutoff] +X_test = data[lambda x: x.time_idx > training_cutoff] +y_train = X_train.pop("volume") +y_test = X_test.pop("volume") +automl = AutoML() +# Configure 
settings for FLAML model +settings = { + "time_budget": 3600, # total running time in seconds + "metric": "mape", # primary metric + "task": "ts_forecast_panel", # task type + "log_file_name": "test/stallion_forecast.log", # flaml log file + "eval_method": "holdout", +} +# Specify kwargs for TimeSeriesDataSet used by TemporalFusionTransformerEstimator +fit_kwargs_by_estimator = { + "tft": { + "max_encoder_length": 24, + "static_categoricals": ["agency", "sku"], + "static_reals": ["avg_population_2017", "avg_yearly_household_income_2017"], + "time_varying_known_categoricals": ["special_days", "month"], + "variable_groups": { + "special_days": special_days + }, # group of categorical variables can be treated as one variable + "time_varying_known_reals": [ + "time_idx", + "price_regular", + "discount_in_percent", + ], + "time_varying_unknown_categoricals": [], + "time_varying_unknown_reals": [ + "y", # always need a 'y' column for the target column + "log_volume", + "industry_volume", + "soda_volume", + "avg_max_temp", + "avg_volume_by_agency", + "avg_volume_by_sku", + ], + "batch_size": 256, + "max_epochs": 1, + "gpu_per_trial": -1, + } +} +# Train the model +automl.fit( + X_train=X_train, + y_train=y_train, + **settings, + period=time_horizon, + group_ids=["agency", "sku"], + fit_kwargs_by_estimator=fit_kwargs_by_estimator, +) +# Compute predictions of testing dataset +y_pred = automl.predict(X_test) +print(y_test) +print(y_pred) +# best model +print(automl.model.estimator) +``` + +#### Sample Output + +``` +[flaml.automl: 07-28 21:26:03] {2478} INFO - task = ts_forecast_panel +[flaml.automl: 07-28 21:26:03] {2480} INFO - Data split method: time +[flaml.automl: 07-28 21:26:03] {2483} INFO - Evaluation method: holdout +[flaml.automl: 07-28 21:26:03] {2552} INFO - Minimizing error metric: mape +[flaml.automl: 07-28 21:26:03] {2694} INFO - List of ML learners in AutoML Run: ['tft'] +[flaml.automl: 07-28 21:26:03] {2986} INFO - iteration 0, current learner tft +GPU 
available: False, used: False +TPU available: False, using: 0 TPU cores +IPU available: False, using: 0 IPUs + + | Name | Type | Params +---------------------------------------------------------------------------------------- +0 | loss | QuantileLoss | 0 +1 | logging_metrics | ModuleList | 0 +2 | input_embeddings | MultiEmbedding | 1.3 K +3 | prescalers | ModuleDict | 256 +4 | static_variable_selection | VariableSelectionNetwork | 3.4 K +5 | encoder_variable_selection | VariableSelectionNetwork | 8.0 K +6 | decoder_variable_selection | VariableSelectionNetwork | 2.7 K +7 | static_context_variable_selection | GatedResidualNetwork | 1.1 K +8 | static_context_initial_hidden_lstm | GatedResidualNetwork | 1.1 K +9 | static_context_initial_cell_lstm | GatedResidualNetwork | 1.1 K +10 | static_context_enrichment | GatedResidualNetwork | 1.1 K +11 | lstm_encoder | LSTM | 4.4 K +12 | lstm_decoder | LSTM | 4.4 K +13 | post_lstm_gate_encoder | GatedLinearUnit | 544 +14 | post_lstm_add_norm_encoder | AddNorm | 32 +15 | static_enrichment | GatedResidualNetwork | 1.4 K +16 | multihead_attn | InterpretableMultiHeadAttention | 676 +17 | post_attn_gate_norm | GateAddNorm | 576 +18 | pos_wise_ff | GatedResidualNetwork | 1.1 K +19 | pre_output_gate_norm | GateAddNorm | 576 +20 | output_layer | Linear | 119 +---------------------------------------------------------------------------------------- +33.6 K Trainable params +0 Non-trainable params +33.6 K Total params +0.135 Total estimated model params size (MB) + +Epoch 19: 100%|██████████| 129/129 [00:56<00:00, 2.27it/s, loss=45.9, v_num=2, train_loss_step=43.00, val_loss=65.20, train_loss_epoch=46.50] + +[flaml.automl: 07-28 21:46:46] {3114} INFO - Estimated sufficient time budget=12424212s. Estimated necessary time budget=12424s. 
+[flaml.automl: 07-28 21:46:46] {3161} INFO - at 1242.6s,\testimator tft's best error=1324290483134574.7500,\tbest estimator tft's best error=1324290483134574.7500 +GPU available: False, used: False +TPU available: False, using: 0 TPU cores +IPU available: False, using: 0 IPUs + + | Name | Type | Params +---------------------------------------------------------------------------------------- +0 | loss | QuantileLoss | 0 +1 | logging_metrics | ModuleList | 0 +2 | input_embeddings | MultiEmbedding | 1.3 K +3 | prescalers | ModuleDict | 256 +4 | static_variable_selection | VariableSelectionNetwork | 3.4 K +5 | encoder_variable_selection | VariableSelectionNetwork | 8.0 K +6 | decoder_variable_selection | VariableSelectionNetwork | 2.7 K +7 | static_context_variable_selection | GatedResidualNetwork | 1.1 K +8 | static_context_initial_hidden_lstm | GatedResidualNetwork | 1.1 K +9 | static_context_initial_cell_lstm | GatedResidualNetwork | 1.1 K +10 | static_context_enrichment | GatedResidualNetwork | 1.1 K +11 | lstm_encoder | LSTM | 4.4 K +12 | lstm_decoder | LSTM | 4.4 K +13 | post_lstm_gate_encoder | GatedLinearUnit | 544 +14 | post_lstm_add_norm_encoder | AddNorm | 32 +15 | static_enrichment | GatedResidualNetwork | 1.4 K +16 | multihead_attn | InterpretableMultiHeadAttention | 676 +17 | post_attn_gate_norm | GateAddNorm | 576 +18 | pos_wise_ff | GatedResidualNetwork | 1.1 K +19 | pre_output_gate_norm | GateAddNorm | 576 +20 | output_layer | Linear | 119 +---------------------------------------------------------------------------------------- +33.6 K Trainable params +0 Non-trainable params +33.6 K Total params +0.135 Total estimated model params size (MB) +Epoch 19: 100%|██████████| 145/145 [01:03<00:00, 2.28it/s, loss=45.2, v_num=3, train_loss_step=46.30, val_loss=67.60, train_loss_epoch=48.10] +[flaml.automl: 07-28 22:08:05] {3425} INFO - retrain tft for 1279.6s +[flaml.automl: 07-28 22:08:05] {3432} INFO - retrained model: TemporalFusionTransformer( + (loss): 
QuantileLoss() + (logging_metrics): ModuleList( + (0): SMAPE() + (1): MAE() + (2): RMSE() + (3): MAPE() + ) + (input_embeddings): MultiEmbedding( + (embeddings): ModuleDict( + (agency): Embedding(58, 16) + (sku): Embedding(25, 10) + (special_days): TimeDistributedEmbeddingBag(11, 6, mode=sum) + (month): Embedding(12, 6) + ) + ) + (prescalers): ModuleDict( + (avg_population_2017): Linear(in_features=1, out_features=8, bias=True) + (avg_yearly_household_income_2017): Linear(in_features=1, out_features=8, bias=True) + (encoder_length): Linear(in_features=1, out_features=8, bias=True) + (y_center): Linear(in_features=1, out_features=8, bias=True) + (y_scale): Linear(in_features=1, out_features=8, bias=True) + (time_idx): Linear(in_features=1, out_features=8, bias=True) + (price_regular): Linear(in_features=1, out_features=8, bias=True) + (discount_in_percent): Linear(in_features=1, out_features=8, bias=True) + (relative_time_idx): Linear(in_features=1, out_features=8, bias=True) + (y): Linear(in_features=1, out_features=8, bias=True) + (log_volume): Linear(in_features=1, out_features=8, bias=True) + (industry_volume): Linear(in_features=1, out_features=8, bias=True) + (soda_volume): Linear(in_features=1, out_features=8, bias=True) + (avg_max_temp): Linear(in_features=1, out_features=8, bias=True) + (avg_volume_by_agency): Linear(in_features=1, out_features=8, bias=True) + (avg_volume_by_sku): Linear(in_features=1, out_features=8, bias=True) + ) + (static_variable_selection): VariableSelectionNetwork( + (flattened_grn): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((7,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=66, out_features=7, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=7, out_features=7, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=7, 
out_features=14, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((7,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (single_variable_grns): ModuleDict( + (agency): ResampleNorm( + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (sku): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (avg_population_2017): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (avg_yearly_household_income_2017): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (encoder_length): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, 
out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (y_center): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (y_scale): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + ) + (prescalers): ModuleDict( + (avg_population_2017): Linear(in_features=1, out_features=8, bias=True) + (avg_yearly_household_income_2017): Linear(in_features=1, out_features=8, bias=True) + (encoder_length): Linear(in_features=1, out_features=8, bias=True) + (y_center): Linear(in_features=1, out_features=8, bias=True) + (y_scale): Linear(in_features=1, out_features=8, bias=True) + ) + (softmax): Softmax(dim=-1) + ) + (encoder_variable_selection): VariableSelectionNetwork( + (flattened_grn): 
GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((13,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=100, out_features=13, bias=True) + (elu): ELU(alpha=1.0) + (context): Linear(in_features=16, out_features=13, bias=False) + (fc2): Linear(in_features=13, out_features=13, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=13, out_features=26, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((13,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (single_variable_grns): ModuleDict( + (special_days): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (month): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (time_idx): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (price_regular): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): 
GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (discount_in_percent): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (relative_time_idx): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (y): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, 
elementwise_affine=True) + ) + ) + ) + (log_volume): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (industry_volume): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (soda_volume): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (avg_max_temp): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): 
LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (avg_volume_by_agency): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (avg_volume_by_sku): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + ) + (prescalers): ModuleDict( + (time_idx): Linear(in_features=1, out_features=8, bias=True) + (price_regular): Linear(in_features=1, out_features=8, bias=True) + (discount_in_percent): Linear(in_features=1, out_features=8, bias=True) + (relative_time_idx): Linear(in_features=1, out_features=8, bias=True) + (y): 
Linear(in_features=1, out_features=8, bias=True) + (log_volume): Linear(in_features=1, out_features=8, bias=True) + (industry_volume): Linear(in_features=1, out_features=8, bias=True) + (soda_volume): Linear(in_features=1, out_features=8, bias=True) + (avg_max_temp): Linear(in_features=1, out_features=8, bias=True) + (avg_volume_by_agency): Linear(in_features=1, out_features=8, bias=True) + (avg_volume_by_sku): Linear(in_features=1, out_features=8, bias=True) + ) + (softmax): Softmax(dim=-1) + ) + (decoder_variable_selection): VariableSelectionNetwork( + (flattened_grn): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((6,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=44, out_features=6, bias=True) + (elu): ELU(alpha=1.0) + (context): Linear(in_features=16, out_features=6, bias=False) + (fc2): Linear(in_features=6, out_features=6, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=6, out_features=12, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((6,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (single_variable_grns): ModuleDict( + (special_days): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (month): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (time_idx): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): 
Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (price_regular): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (discount_in_percent): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (relative_time_idx): GatedResidualNetwork( + (resample_norm): ResampleNorm( + (resample): TimeDistributedInterpolation() + (gate): Sigmoid() + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (fc1): Linear(in_features=8, out_features=8, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=8, out_features=8, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=8, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + 
) + ) + ) + ) + (prescalers): ModuleDict( + (time_idx): Linear(in_features=1, out_features=8, bias=True) + (price_regular): Linear(in_features=1, out_features=8, bias=True) + (discount_in_percent): Linear(in_features=1, out_features=8, bias=True) + (relative_time_idx): Linear(in_features=1, out_features=8, bias=True) + ) + (softmax): Softmax(dim=-1) + ) + (static_context_variable_selection): GatedResidualNetwork( + (fc1): Linear(in_features=16, out_features=16, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=16, out_features=16, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=16, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (static_context_initial_hidden_lstm): GatedResidualNetwork( + (fc1): Linear(in_features=16, out_features=16, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=16, out_features=16, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=16, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (static_context_initial_cell_lstm): GatedResidualNetwork( + (fc1): Linear(in_features=16, out_features=16, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=16, out_features=16, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=16, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (static_context_enrichment): GatedResidualNetwork( + (fc1): Linear(in_features=16, out_features=16, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=16, out_features=16, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + 
(dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=16, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (lstm_encoder): LSTM(16, 16, num_layers=2, batch_first=True, dropout=0.1) + (lstm_decoder): LSTM(16, 16, num_layers=2, batch_first=True, dropout=0.1) + (post_lstm_gate_encoder): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=16, out_features=32, bias=True) + ) + (post_lstm_gate_decoder): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=16, out_features=32, bias=True) + ) + (post_lstm_add_norm_encoder): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (post_lstm_add_norm_decoder): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + (static_enrichment): GatedResidualNetwork( + (fc1): Linear(in_features=16, out_features=16, bias=True) + (elu): ELU(alpha=1.0) + (context): Linear(in_features=16, out_features=16, bias=False) + (fc2): Linear(in_features=16, out_features=16, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=16, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (multihead_attn): InterpretableMultiHeadAttention( + (dropout): Dropout(p=0.1, inplace=False) + (v_layer): Linear(in_features=16, out_features=4, bias=True) + (q_layers): ModuleList( + (0): Linear(in_features=16, out_features=4, bias=True) + (1): Linear(in_features=16, out_features=4, bias=True) + (2): Linear(in_features=16, out_features=4, bias=True) + (3): Linear(in_features=16, out_features=4, bias=True) + ) + (k_layers): ModuleList( + (0): Linear(in_features=16, out_features=4, bias=True) + (1): Linear(in_features=16, out_features=4, bias=True) + (2): Linear(in_features=16, out_features=4, bias=True) + 
(3): Linear(in_features=16, out_features=4, bias=True) + ) + (attention): ScaledDotProductAttention( + (softmax): Softmax(dim=2) + ) + (w_h): Linear(in_features=4, out_features=16, bias=False) + ) + (post_attn_gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=16, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + (pos_wise_ff): GatedResidualNetwork( + (fc1): Linear(in_features=16, out_features=16, bias=True) + (elu): ELU(alpha=1.0) + (fc2): Linear(in_features=16, out_features=16, bias=True) + (gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (dropout): Dropout(p=0.1, inplace=False) + (fc): Linear(in_features=16, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (pre_output_gate_norm): GateAddNorm( + (glu): GatedLinearUnit( + (fc): Linear(in_features=16, out_features=32, bias=True) + ) + (add_norm): AddNorm( + (norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) + ) + ) + (output_layer): Linear(in_features=16, out_features=7, bias=True) +) +[flaml.automl: 07-28 22:08:05] {2725} INFO - fit succeeded +[flaml.automl: 07-28 22:08:05] {2726} INFO - Time taken to find the best model: 1242.6435902118683 +[flaml.automl: 07-28 22:08:05] {2737} WARNING - Time taken to find the best model is 414% of the provided time budget and not all estimators' hyperparameter search converged. 
Consider increasing the time budget.
+
+
+
+```
+
+[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_time_series_forecast.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_time_series_forecast.ipynb)
\ No newline at end of file
diff --git a/website/versioned_docs/version-1.0.4/Examples/AutoML-for-LightGBM.md b/website/versioned_docs/version-1.0.4/Examples/AutoML-for-LightGBM.md
new file mode 100644
index 0000000000..0094dd561f
--- /dev/null
+++ b/website/versioned_docs/version-1.0.4/Examples/AutoML-for-LightGBM.md
@@ -0,0 +1,209 @@
+# AutoML for LightGBM
+
+### Prerequisites for this example
+
+Install the [notebook] option.
+```bash
+pip install "flaml[notebook]"
+```
+
+This option is not necessary in general.
+
+### Use built-in LGBMEstimator
+
+```python
+from flaml import AutoML
+from flaml.automl.data import load_openml_dataset
+
+# Download [houses dataset](https://www.openml.org/d/537) from OpenML. The task is to predict median price of the house in the region based on demographic composition and a state of housing market in the region.
+X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir='./') + +automl = AutoML() +settings = { + "time_budget": 60, # total running time in seconds + "metric": 'r2', # primary metrics for regression can be chosen from: ['mae','mse','r2'] + "estimator_list": ['lgbm'], # list of ML learners; we tune lightgbm in this example + "task": 'regression', # task type + "log_file_name": 'houses_experiment.log', # flaml log file + "seed": 7654321, # random seed +} +automl.fit(X_train=X_train, y_train=y_train, **settings) +``` + +#### Sample output + +``` +[flaml.automl: 11-15 19:46:44] {1485} INFO - Data split method: uniform +[flaml.automl: 11-15 19:46:44] {1489} INFO - Evaluation method: cv +[flaml.automl: 11-15 19:46:44] {1540} INFO - Minimizing error metric: 1-r2 +[flaml.automl: 11-15 19:46:44] {1577} INFO - List of ML learners in AutoML Run: ['lgbm'] +[flaml.automl: 11-15 19:46:44] {1826} INFO - iteration 0, current learner lgbm +[flaml.automl: 11-15 19:46:44] {1944} INFO - Estimated sufficient time budget=3232s. Estimated necessary time budget=3s. 
+[flaml.automl: 11-15 19:46:44] {2029} INFO - at 0.5s, estimator lgbm's best error=0.7383, best estimator lgbm's best error=0.7383 +[flaml.automl: 11-15 19:46:44] {1826} INFO - iteration 1, current learner lgbm +[flaml.automl: 11-15 19:46:44] {2029} INFO - at 0.6s, estimator lgbm's best error=0.4774, best estimator lgbm's best error=0.4774 +[flaml.automl: 11-15 19:46:44] {1826} INFO - iteration 2, current learner lgbm +[flaml.automl: 11-15 19:46:44] {2029} INFO - at 0.7s, estimator lgbm's best error=0.4774, best estimator lgbm's best error=0.4774 +[flaml.automl: 11-15 19:46:44] {1826} INFO - iteration 3, current learner lgbm +[flaml.automl: 11-15 19:46:44] {2029} INFO - at 0.9s, estimator lgbm's best error=0.2985, best estimator lgbm's best error=0.2985 +[flaml.automl: 11-15 19:46:44] {1826} INFO - iteration 4, current learner lgbm +[flaml.automl: 11-15 19:46:45] {2029} INFO - at 1.3s, estimator lgbm's best error=0.2337, best estimator lgbm's best error=0.2337 +[flaml.automl: 11-15 19:46:45] {1826} INFO - iteration 5, current learner lgbm +[flaml.automl: 11-15 19:46:45] {2029} INFO - at 1.4s, estimator lgbm's best error=0.2337, best estimator lgbm's best error=0.2337 +[flaml.automl: 11-15 19:46:45] {1826} INFO - iteration 6, current learner lgbm +[flaml.automl: 11-15 19:46:46] {2029} INFO - at 2.5s, estimator lgbm's best error=0.2219, best estimator lgbm's best error=0.2219 +[flaml.automl: 11-15 19:46:46] {1826} INFO - iteration 7, current learner lgbm +[flaml.automl: 11-15 19:46:46] {2029} INFO - at 2.9s, estimator lgbm's best error=0.2219, best estimator lgbm's best error=0.2219 +[flaml.automl: 11-15 19:46:46] {1826} INFO - iteration 8, current learner lgbm +[flaml.automl: 11-15 19:46:48] {2029} INFO - at 4.5s, estimator lgbm's best error=0.1764, best estimator lgbm's best error=0.1764 +[flaml.automl: 11-15 19:46:48] {1826} INFO - iteration 9, current learner lgbm +[flaml.automl: 11-15 19:46:54] {2029} INFO - at 10.5s, estimator lgbm's best error=0.1630, best 
estimator lgbm's best error=0.1630 +[flaml.automl: 11-15 19:46:54] {1826} INFO - iteration 10, current learner lgbm +[flaml.automl: 11-15 19:46:56] {2029} INFO - at 12.4s, estimator lgbm's best error=0.1630, best estimator lgbm's best error=0.1630 +[flaml.automl: 11-15 19:46:56] {1826} INFO - iteration 11, current learner lgbm +[flaml.automl: 11-15 19:47:13] {2029} INFO - at 29.0s, estimator lgbm's best error=0.1630, best estimator lgbm's best error=0.1630 +[flaml.automl: 11-15 19:47:13] {1826} INFO - iteration 12, current learner lgbm +[flaml.automl: 11-15 19:47:15] {2029} INFO - at 31.1s, estimator lgbm's best error=0.1630, best estimator lgbm's best error=0.1630 +[flaml.automl: 11-15 19:47:15] {1826} INFO - iteration 13, current learner lgbm +[flaml.automl: 11-15 19:47:29] {2029} INFO - at 45.8s, estimator lgbm's best error=0.1564, best estimator lgbm's best error=0.1564 +[flaml.automl: 11-15 19:47:33] {2242} INFO - retrain lgbm for 3.2s +[flaml.automl: 11-15 19:47:33] {2247} INFO - retrained model: LGBMRegressor(colsample_bytree=0.8025848209352517, + learning_rate=0.09100963138990374, max_bin=255, + min_child_samples=42, n_estimators=363, num_leaves=216, + reg_alpha=0.001113000336715291, reg_lambda=76.50614276906414, + verbose=-1) +[flaml.automl: 11-15 19:47:33] {1608} INFO - fit succeeded +[flaml.automl: 11-15 19:47:33] {1610} INFO - Time taken to find the best model: 45.75616669654846 +[flaml.automl: 11-15 19:47:33] {1624} WARNING - Time taken to find the best model is 76% of the provided time budget and not all estimators' hyperparameter search converged. Consider increasing the time budget. 
+```
+
+#### Retrieve best config
+
+```python
+print('Best hyperparameter config:', automl.best_config)
+print('Best r2 on validation data: {0:.4g}'.format(1-automl.best_loss))
+print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))
+print(automl.model.estimator)
+# Best hyperparameter config: {'n_estimators': 363, 'num_leaves': 216, 'min_child_samples': 42, 'learning_rate': 0.09100963138990374, 'log_max_bin': 8, 'colsample_bytree': 0.8025848209352517, 'reg_alpha': 0.001113000336715291, 'reg_lambda': 76.50614276906414}
+# Best r2 on validation data: 0.8436
+# Training duration of best run: 3.229 s
+# LGBMRegressor(colsample_bytree=0.8025848209352517,
+#               learning_rate=0.09100963138990374, max_bin=255,
+#               min_child_samples=42, n_estimators=363, num_leaves=216,
+#               reg_alpha=0.001113000336715291, reg_lambda=76.50614276906414,
+#               verbose=-1)
+```
+
+#### Plot feature importance
+
+```python
+import matplotlib.pyplot as plt
+plt.barh(automl.feature_names_in_, automl.feature_importances_)
+```
+![png](../Use-Cases/images/feature_importance.png)
+
+#### Compute predictions of testing dataset
+
+```python
+y_pred = automl.predict(X_test)
+print('Predicted labels', y_pred)
+# Predicted labels [143391.65036562 245535.13731811 153171.44071629 ... 184354.52735963
+# 235510.49470445 282617.22858956]
+```
+
+#### Compute different metric values on testing dataset
+
+```python
+from flaml.automl.ml import sklearn_metric_loss_score
+
+print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
+print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))
+print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))
+# r2 = 0.8505434326526395
+# mse = 1975592613.138005
+# mae = 29471.536046068788
+```
+
+#### Compare with untuned LightGBM
+
+```python
+from lightgbm import LGBMRegressor
+
+lgbm = LGBMRegressor()
+lgbm.fit(X_train, y_train)
+y_pred = lgbm.predict(X_test)
+from flaml.automl.ml import sklearn_metric_loss_score
+
+print('default lgbm r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
+# default lgbm r2 = 0.8296179648694404
+```
+
+#### Plot learning curve
+
+How does the model accuracy improve as we search for different hyperparameter configurations?
+
+```python
+from flaml.automl.data import get_output_from_log
+import numpy as np
+
+time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \
+    get_output_from_log(filename=settings['log_file_name'], time_budget=60)
+plt.title('Learning Curve')
+plt.xlabel('Wall Clock Time (s)')
+plt.ylabel('Validation r2')
+plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')
+plt.show()
+```
+![png](images/lgbm_curve.png)
+
+### Use a customized LightGBM learner
+
+The native API of LightGBM allows one to specify a custom objective function in the model constructor. You can easily enable it by adding a customized LightGBM learner in FLAML. In the following example, we show how to add such a customized LightGBM learner with a custom objective function.
+ +#### Create a customized LightGBM learner with a custom objective function + +```python +import numpy as np + + +# define your customized objective function +def my_loss_obj(y_true, y_pred): + c = 0.5 + residual = y_pred - y_true + grad = c * residual / (np.abs(residual) + c) + hess = c ** 2 / (np.abs(residual) + c) ** 2 + # rmse grad and hess + grad_rmse = residual + hess_rmse = 1.0 + + # mae grad and hess + grad_mae = np.array(residual) + grad_mae[grad_mae > 0] = 1. + grad_mae[grad_mae <= 0] = -1. + hess_mae = 1.0 + + coef = [0.4, 0.3, 0.3] + return coef[0] * grad + coef[1] * grad_rmse + coef[2] * grad_mae, + coef[0] * hess + coef[1] * hess_rmse + coef[2] * hess_mae + + +from flaml.automl.model import LGBMEstimator + + +class MyLGBM(LGBMEstimator): + """LGBMEstimator with my_loss_obj as the objective function""" + + def __init__(self, **config): + super().__init__(objective=my_loss_obj, **config) +``` + +#### Add the customized learner and tune it + +```python +automl = AutoML() +automl.add_learner(learner_name='my_lgbm', learner_class=MyLGBM) +settings["estimator_list"] = ['my_lgbm'] # change the estimator list +automl.fit(X_train=X_train, y_train=y_train, **settings) +``` + +[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_lightgbm.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_lightgbm.ipynb) \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.4/Examples/AutoML-for-XGBoost.md b/website/versioned_docs/version-1.0.4/Examples/AutoML-for-XGBoost.md new file mode 100644 index 0000000000..774050a53a --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Examples/AutoML-for-XGBoost.md @@ -0,0 +1,234 @@ +# AutoML for XGBoost + +### Prerequisites for this example + +Install the [notebook] option. +```bash +pip install "flaml[notebook]" +``` + +This option is not necessary in general. 
+ +### Use built-in XGBoostSklearnEstimator + +```python +from flaml import AutoML +from flaml.automl.data import load_openml_dataset + +# Download [houses dataset](https://www.openml.org/d/537) from OpenML. The task is to predict median price of the house in the region based on demographic composition and a state of housing market in the region. +X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir='./') + +automl = AutoML() +settings = { + "time_budget": 60, # total running time in seconds + "metric": 'r2', # primary metrics for regression can be chosen from: ['mae','mse','r2'] + "estimator_list": ['xgboost'], # list of ML learners; we tune XGBoost in this example + "task": 'regression', # task type + "log_file_name": 'houses_experiment.log', # flaml log file + "seed": 7654321, # random seed +} +automl.fit(X_train=X_train, y_train=y_train, **settings) +``` + +#### Sample output + +``` +[flaml.automl: 09-29 23:06:46] {1446} INFO - Data split method: uniform +[flaml.automl: 09-29 23:06:46] {1450} INFO - Evaluation method: cv +[flaml.automl: 09-29 23:06:46] {1496} INFO - Minimizing error metric: 1-r2 +[flaml.automl: 09-29 23:06:46] {1533} INFO - List of ML learners in AutoML Run: ['xgboost'] +[flaml.automl: 09-29 23:06:46] {1763} INFO - iteration 0, current learner xgboost +[flaml.automl: 09-29 23:06:47] {1880} INFO - Estimated sufficient time budget=2621s. Estimated necessary time budget=3s. 
+[flaml.automl: 09-29 23:06:47] {1952} INFO - at 0.3s, estimator xgboost's best error=2.1267, best estimator xgboost's best error=2.1267 +[flaml.automl: 09-29 23:06:47] {1763} INFO - iteration 1, current learner xgboost +[flaml.automl: 09-29 23:06:47] {1952} INFO - at 0.5s, estimator xgboost's best error=2.1267, best estimator xgboost's best error=2.1267 +[flaml.automl: 09-29 23:06:47] {1763} INFO - iteration 2, current learner xgboost +[flaml.automl: 09-29 23:06:47] {1952} INFO - at 0.6s, estimator xgboost's best error=0.8485, best estimator xgboost's best error=0.8485 +[flaml.automl: 09-29 23:06:47] {1763} INFO - iteration 3, current learner xgboost +[flaml.automl: 09-29 23:06:47] {1952} INFO - at 0.8s, estimator xgboost's best error=0.3799, best estimator xgboost's best error=0.3799 +[flaml.automl: 09-29 23:06:47] {1763} INFO - iteration 4, current learner xgboost +[flaml.automl: 09-29 23:06:47] {1952} INFO - at 1.0s, estimator xgboost's best error=0.3799, best estimator xgboost's best error=0.3799 +[flaml.automl: 09-29 23:06:47] {1763} INFO - iteration 5, current learner xgboost +[flaml.automl: 09-29 23:06:47] {1952} INFO - at 1.2s, estimator xgboost's best error=0.3799, best estimator xgboost's best error=0.3799 +[flaml.automl: 09-29 23:06:47] {1763} INFO - iteration 6, current learner xgboost +[flaml.automl: 09-29 23:06:48] {1952} INFO - at 1.5s, estimator xgboost's best error=0.2992, best estimator xgboost's best error=0.2992 +[flaml.automl: 09-29 23:06:48] {1763} INFO - iteration 7, current learner xgboost +[flaml.automl: 09-29 23:06:48] {1952} INFO - at 1.9s, estimator xgboost's best error=0.2992, best estimator xgboost's best error=0.2992 +[flaml.automl: 09-29 23:06:48] {1763} INFO - iteration 8, current learner xgboost +[flaml.automl: 09-29 23:06:49] {1952} INFO - at 2.2s, estimator xgboost's best error=0.2992, best estimator xgboost's best error=0.2992 +[flaml.automl: 09-29 23:06:49] {1763} INFO - iteration 9, current learner xgboost +[flaml.automl: 
09-29 23:06:49] {1952} INFO - at 2.5s, estimator xgboost's best error=0.2513, best estimator xgboost's best error=0.2513 +[flaml.automl: 09-29 23:06:49] {1763} INFO - iteration 10, current learner xgboost +[flaml.automl: 09-29 23:06:49] {1952} INFO - at 2.8s, estimator xgboost's best error=0.2513, best estimator xgboost's best error=0.2513 +[flaml.automl: 09-29 23:06:49] {1763} INFO - iteration 11, current learner xgboost +[flaml.automl: 09-29 23:06:49] {1952} INFO - at 3.0s, estimator xgboost's best error=0.2513, best estimator xgboost's best error=0.2513 +[flaml.automl: 09-29 23:06:49] {1763} INFO - iteration 12, current learner xgboost +[flaml.automl: 09-29 23:06:50] {1952} INFO - at 3.3s, estimator xgboost's best error=0.2113, best estimator xgboost's best error=0.2113 +[flaml.automl: 09-29 23:06:50] {1763} INFO - iteration 13, current learner xgboost +[flaml.automl: 09-29 23:06:50] {1952} INFO - at 3.5s, estimator xgboost's best error=0.2113, best estimator xgboost's best error=0.2113 +[flaml.automl: 09-29 23:06:50] {1763} INFO - iteration 14, current learner xgboost +[flaml.automl: 09-29 23:06:50] {1952} INFO - at 4.0s, estimator xgboost's best error=0.2090, best estimator xgboost's best error=0.2090 +[flaml.automl: 09-29 23:06:50] {1763} INFO - iteration 15, current learner xgboost +[flaml.automl: 09-29 23:06:51] {1952} INFO - at 4.5s, estimator xgboost's best error=0.2090, best estimator xgboost's best error=0.2090 +[flaml.automl: 09-29 23:06:51] {1763} INFO - iteration 16, current learner xgboost +[flaml.automl: 09-29 23:06:51] {1952} INFO - at 5.2s, estimator xgboost's best error=0.1919, best estimator xgboost's best error=0.1919 +[flaml.automl: 09-29 23:06:51] {1763} INFO - iteration 17, current learner xgboost +[flaml.automl: 09-29 23:06:52] {1952} INFO - at 5.5s, estimator xgboost's best error=0.1919, best estimator xgboost's best error=0.1919 +[flaml.automl: 09-29 23:06:52] {1763} INFO - iteration 18, current learner xgboost +[flaml.automl: 09-29 
23:06:54] {1952} INFO - at 8.0s, estimator xgboost's best error=0.1797, best estimator xgboost's best error=0.1797 +[flaml.automl: 09-29 23:06:54] {1763} INFO - iteration 19, current learner xgboost +[flaml.automl: 09-29 23:06:55] {1952} INFO - at 9.0s, estimator xgboost's best error=0.1797, best estimator xgboost's best error=0.1797 +[flaml.automl: 09-29 23:06:55] {1763} INFO - iteration 20, current learner xgboost +[flaml.automl: 09-29 23:07:08] {1952} INFO - at 21.8s, estimator xgboost's best error=0.1797, best estimator xgboost's best error=0.1797 +[flaml.automl: 09-29 23:07:08] {1763} INFO - iteration 21, current learner xgboost +[flaml.automl: 09-29 23:07:11] {1952} INFO - at 24.4s, estimator xgboost's best error=0.1797, best estimator xgboost's best error=0.1797 +[flaml.automl: 09-29 23:07:11] {1763} INFO - iteration 22, current learner xgboost +[flaml.automl: 09-29 23:07:16] {1952} INFO - at 30.0s, estimator xgboost's best error=0.1782, best estimator xgboost's best error=0.1782 +[flaml.automl: 09-29 23:07:16] {1763} INFO - iteration 23, current learner xgboost +[flaml.automl: 09-29 23:07:20] {1952} INFO - at 33.5s, estimator xgboost's best error=0.1782, best estimator xgboost's best error=0.1782 +[flaml.automl: 09-29 23:07:20] {1763} INFO - iteration 24, current learner xgboost +[flaml.automl: 09-29 23:07:29] {1952} INFO - at 42.3s, estimator xgboost's best error=0.1782, best estimator xgboost's best error=0.1782 +[flaml.automl: 09-29 23:07:29] {1763} INFO - iteration 25, current learner xgboost +[flaml.automl: 09-29 23:07:30] {1952} INFO - at 43.2s, estimator xgboost's best error=0.1782, best estimator xgboost's best error=0.1782 +[flaml.automl: 09-29 23:07:30] {1763} INFO - iteration 26, current learner xgboost +[flaml.automl: 09-29 23:07:50] {1952} INFO - at 63.4s, estimator xgboost's best error=0.1663, best estimator xgboost's best error=0.1663 +[flaml.automl: 09-29 23:07:50] {2059} INFO - selected model: +[flaml.automl: 09-29 23:07:55] {2122} INFO - 
retrain xgboost for 5.4s +[flaml.automl: 09-29 23:07:55] {2128} INFO - retrained model: +[flaml.automl: 09-29 23:07:55] {1557} INFO - fit succeeded +[flaml.automl: 09-29 23:07:55] {1558} INFO - Time taken to find the best model: 63.427649974823 +[flaml.automl: 09-29 23:07:55] {1569} WARNING - Time taken to find the best model is 106% of the provided time budget and not all estimators' hyperparameter search converged. Consider increasing the time budget. +``` + +#### Retrieve best config + +```python +print('Best hyperparmeter config:', automl.best_config) +print('Best r2 on validation data: {0:.4g}'.format(1-automl.best_loss)) +print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time)) +print(automl.model.estimator) +# Best hyperparmeter config: {'n_estimators': 473, 'max_leaves': 35, 'max_depth': 0, 'min_child_weight': 0.001, 'learning_rate': 0.26865031351923346, 'subsample': 0.9718245679598786, 'colsample_bylevel': 0.7421362469066445, 'colsample_bytree': 1.0, 'reg_alpha': 0.06824336834995245, 'reg_lambda': 250.9654222583276} +# Best r2 on validation data: 0.8384 +# Training duration of best run: 2.194 s +# XGBRegressor(base_score=0.5, booster='gbtree', +# colsample_bylevel=0.7421362469066445, colsample_bynode=1, +# colsample_bytree=1.0, gamma=0, gpu_id=-1, grow_policy='lossguide', +# importance_type='gain', interaction_constraints='', +# learning_rate=0.26865031351923346, max_delta_step=0, max_depth=0, +# max_leaves=35, min_child_weight=0.001, missing=nan, +# monotone_constraints='()', n_estimators=473, n_jobs=-1, +# num_parallel_tree=1, random_state=0, reg_alpha=0.06824336834995245, +# reg_lambda=250.9654222583276, scale_pos_weight=1, +# subsample=0.9718245679598786, tree_method='hist', +# use_label_encoder=False, validate_parameters=1, verbosity=0) +``` + +#### Plot feature importance + +```python +import matplotlib.pyplot as plt + +plt.barh(automl.feature_names_in_, automl.feature_importances_) +``` 
+![png](images/xgb_feature_importance.png) + +#### Compute predictions of testing dataset + +```python +y_pred = automl.predict(X_test) +print('Predicted labels', y_pred) +# Predicted labels [139062.95 237622. 140522.03 ... 182125.5 252156.36 264884.5 ] +``` + +#### Compute different metric values on testing dataset + +```python +from flaml.automl.ml import sklearn_metric_loss_score + +print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test)) +print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test)) +print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test)) +# r2 = 0.8456494234135888 +# mse = 2040284106.2781258 +# mae = 30212.830996680445 +``` + +#### Compare with untuned XGBoost + +```python +from xgboost import XGBRegressor + +xgb = XGBRegressor() +xgb.fit(X_train, y_train) +y_pred = xgb.predict(X_test) +from flaml.automl.ml import sklearn_metric_loss_score + +print('default xgboost r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test)) +# default xgboost r2 = 0.8265451174596482 +``` + +#### Plot learning curve + +How does the model accuracy improve as we search for different hyperparameter configurations? + +```python +from flaml.automl.data import get_output_from_log +import numpy as np + +time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = + get_output_from_log(filename=settings['log_file_name'], time_budget=60) +plt.title('Learning Curve') +plt.xlabel('Wall Clock Time (s)') +plt.ylabel('Validation r2') +plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post') +plt.show() +``` +![png](images/xgb_curve.png) + +### Use a customized XGBoost learner + +You can easily enable a custom objective function by adding a customized XGBoost learner (inherit XGBoostEstimator or XGBoostSklearnEstimator) in FLAML. In the following example, we show how to add such a customized XGBoost learner with a custom objective function. 
+ +```python +import numpy as np + + +# define your customized objective function +def logregobj(preds, dtrain): + labels = dtrain.get_label() + preds = 1.0 / (1.0 + np.exp(-preds)) # transform raw leaf weight + grad = preds - labels + hess = preds * (1.0 - preds) + return grad, hess + + +from flaml.automl.model import XGBoostEstimator + + +class MyXGB1(XGBoostEstimator): + '''XGBoostEstimator with the logregobj function as the objective function + ''' + + def __init__(self, **config): + super().__init__(objective=logregobj, **config) + + +class MyXGB2(XGBoostEstimator): + '''XGBoostEstimator with 'reg:squarederror' as the objective function + ''' + + def __init__(self, **config): + super().__init__(objective='reg:gamma', **config) +``` + +#### Add the customized learners and tune them + +```python +automl = AutoML() +automl.add_learner(learner_name='my_xgb1', learner_class=MyXGB1) +automl.add_learner(learner_name='my_xgb2', learner_class=MyXGB2) +settings["estimator_list"] = ['my_xgb1', 'my_xgb2'] # change the estimator list +automl.fit(X_train=X_train, y_train=y_train, **settings) +``` + +[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_xgboost.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_xgboost.ipynb) \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.4/Examples/Default-Flamlized.md b/website/versioned_docs/version-1.0.4/Examples/Default-Flamlized.md new file mode 100644 index 0000000000..d87ee47f9b --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Examples/Default-Flamlized.md @@ -0,0 +1,99 @@ +# Default - Flamlized Estimator + +Flamlized estimators automatically use data-dependent default hyperparameter configurations for each estimator, offering a unique zero-shot AutoML capability, or "no tuning" AutoML. + +This example requires openml==0.10.2. 
+ +## Flamlized LGBMRegressor + +### Zero-shot AutoML + +```python +from flaml.automl.data import load_openml_dataset +from flaml.default import LGBMRegressor +from flaml.automl.ml import sklearn_metric_loss_score + +X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir="./") +lgbm = LGBMRegressor() +lgbm.fit(X_train, y_train) +y_pred = lgbm.predict(X_test) +print("flamlized lgbm r2", "=", 1 - sklearn_metric_loss_score("r2", y_pred, y_test)) +print(lgbm) +``` + +#### Sample output + +``` +load dataset from ./openml_ds537.pkl +Dataset name: houses +X_train.shape: (15480, 8), y_train.shape: (15480,); +X_test.shape: (5160, 8), y_test.shape: (5160,) +flamlized lgbm r2 = 0.8537444671194614 +LGBMRegressor(colsample_bytree=0.7019911744574896, + learning_rate=0.022635758411078528, max_bin=511, + min_child_samples=2, n_estimators=4797, num_leaves=122, + reg_alpha=0.004252223402511765, reg_lambda=0.11288241427227624, + verbose=-1) +``` + +### Suggest hyperparameters without training + +``` +from flaml.data import load_openml_dataset +from flaml.default import LGBMRegressor +from flaml.ml import sklearn_metric_loss_score + +X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir="./") +lgbm = LGBMRegressor() +hyperparams, estimator_name, X_transformed, y_transformed = lgbm.suggest_hyperparams(X_train, y_train) +print(hyperparams) +``` + +#### Sample output +``` +load dataset from ./openml_ds537.pkl +Dataset name: houses +X_train.shape: (15480, 8), y_train.shape: (15480,); +X_test.shape: (5160, 8), y_test.shape: (5160,) +{'n_estimators': 4797, 'num_leaves': 122, 'min_child_samples': 2, 'learning_rate': 0.022635758411078528, 'colsample_bytree': 0.7019911744574896, 'reg_alpha': 0.004252223402511765, 'reg_lambda': 0.11288241427227624, 'max_bin': 511, 'verbose': -1} +``` + +[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/zeroshot_lightgbm.ipynb) | [Open in 
colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/zeroshot_lightgbm.ipynb) + +## Flamlized XGBClassifier + +### Zero-shot AutoML + +```python +from flaml.automl.data import load_openml_dataset +from flaml.default import XGBClassifier +from flaml.automl.ml import sklearn_metric_loss_score + +X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir="./") +xgb = XGBClassifier() +xgb.fit(X_train, y_train) +y_pred = xgb.predict(X_test) +print("flamlized xgb accuracy", "=", 1 - sklearn_metric_loss_score("accuracy", y_pred, y_test)) +print(xgb) +``` + +#### Sample output + +``` +load dataset from ./openml_ds1169.pkl +Dataset name: airlines +X_train.shape: (404537, 7), y_train.shape: (404537,); +X_test.shape: (134846, 7), y_test.shape: (134846,) +flamlized xgb accuracy = 0.6729009388487608 +XGBClassifier(base_score=0.5, booster='gbtree', + colsample_bylevel=0.4601573737792679, colsample_bynode=1, + colsample_bytree=1.0, gamma=0, gpu_id=-1, grow_policy='lossguide', + importance_type='gain', interaction_constraints='', + learning_rate=0.04039771837785377, max_delta_step=0, max_depth=0, + max_leaves=159, min_child_weight=0.3396294979905001, missing=nan, + monotone_constraints='()', n_estimators=540, n_jobs=4, + num_parallel_tree=1, random_state=0, + reg_alpha=0.0012362430984376035, reg_lambda=3.093428791531145, + scale_pos_weight=1, subsample=1.0, tree_method='hist', + use_label_encoder=False, validate_parameters=1, verbosity=0) +``` diff --git a/website/versioned_docs/version-1.0.4/Examples/Integrate - AzureML.md b/website/versioned_docs/version-1.0.4/Examples/Integrate - AzureML.md new file mode 100644 index 0000000000..4d9db90880 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Examples/Integrate - AzureML.md @@ -0,0 +1,168 @@ +FLAML can be used together with AzureML. On top of that, using mlflow and ray is easy too. + +### Prerequisites + +Install the [azureml] option. 
+```bash +pip install "flaml[azureml]" +``` + +Set up an AzureML workspace: +```python +from azureml.core import Workspace + +ws = Workspace.create(name='myworkspace', subscription_id='', resource_group='myresourcegroup') +``` + +### Enable mlflow in AzureML workspace + +```python +import mlflow +from azureml.core import Workspace + +ws = Workspace.from_config() +mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri()) +``` + +### Start an AutoML run + +```python +from flaml.automl.data import load_openml_dataset +from flaml import AutoML + +# Download [Airlines dataset](https://www.openml.org/d/1169) from OpenML. The task is to predict whether a given flight will be delayed, given the information of the scheduled departure. +X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir="./") + +automl = AutoML() +settings = { + "time_budget": 60, # total running time in seconds + "metric": "accuracy", # metric to optimize + "task": "classification", # task type + "log_file_name": "airlines_experiment.log", # flaml log file +} +experiment = mlflow.set_experiment("flaml") # the experiment name in AzureML workspace +with mlflow.start_run() as run: # create a mlflow run + automl.fit(X_train=X_train, y_train=y_train, **settings) + mlflow.sklearn.log_model(automl, "automl") +``` + +The metrics in the run will be automatically logged in an experiment named "flaml" in your AzureML workspace. 
They can be retrieved by `mlflow.search_runs`: + +```python +mlflow.search_runs(experiment_ids=[experiment.experiment_id], filter_string="params.learner = 'xgboost'") +``` + +The logged model can be loaded and used to make predictions: +```python +automl = mlflow.sklearn.load_model(f"{run.info.artifact_uri}/automl") +print(automl.predict(X_test)) +``` + +[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_azureml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_azureml.ipynb) + +### Use ray to distribute across a cluster + +When you have a compute cluster in AzureML, you can distribute `flaml.AutoML` or `flaml.tune` with ray. + +#### Build a ray environment in AzureML + +Create a docker file such as [.Docker/Dockerfile-cpu](https://github.com/microsoft/FLAML/blob/main/test/.Docker/Dockerfile-cpu). Make sure `RUN pip install flaml[blendsearch,ray]` is included in the docker file. + +Then build a AzureML environment in the workspace `ws`. + +```python +ray_environment_name = "aml-ray-cpu" +ray_environment_dockerfile_path = "./Docker/Dockerfile-cpu" + +# Build CPU image for Ray +ray_cpu_env = Environment.from_dockerfile(name=ray_environment_name, dockerfile=ray_environment_dockerfile_path) +ray_cpu_env.register(workspace=ws) +ray_cpu_build_details = ray_cpu_env.build(workspace=ws) + +import time +while ray_cpu_build_details.status not in ["Succeeded", "Failed"]: + print(f"Awaiting completion of ray CPU environment build. Current status is: {ray_cpu_build_details.status}") + time.sleep(10) +``` + +You only need to do this step once for one workspace. + +#### Create a compute cluster with multiple nodes + +```python +from azureml.core.compute import AmlCompute, ComputeTarget + +compute_target_name = "cpucluster" +node_count = 2 + +# This example uses CPU VM. 
For using GPU VM, set SKU to STANDARD_NC6 +compute_target_size = "STANDARD_D2_V2" + +if compute_target_name in ws.compute_targets: + compute_target = ws.compute_targets[compute_target_name] + if compute_target and type(compute_target) is AmlCompute: + if compute_target.provisioning_state == "Succeeded": + print("Found compute target; using it:", compute_target_name) + else: + raise Exception( + "Found compute target but it is in state", compute_target.provisioning_state) +else: + print("creating a new compute target...") + provisioning_config = AmlCompute.provisioning_configuration( + vm_size=compute_target_size, + min_nodes=0, + max_nodes=node_count) + + # Create the cluster + compute_target = ComputeTarget.create(ws, compute_target_name, provisioning_config) + + # Can poll for a minimum number of nodes and for a specific timeout. + # If no min node count is provided it will use the scale settings for the cluster + compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20) + + # For a more detailed view of current AmlCompute status, use get_status() + print(compute_target.get_status().serialize()) +``` + +If the compute target "cpucluster" already exists, it will not be recreated. + +#### Run distributed AutoML job + +Assuming you have an automl script like [ray/distribute_automl.py](https://github.com/microsoft/FLAML/blob/main/test/ray/distribute_automl.py). It uses `n_concurrent_trials=k` to inform `AutoML.fit()` to perform k concurrent trials in parallel. 
+ +Submit an AzureML job as the following: + +```python +from azureml.core import Workspace, Experiment, ScriptRunConfig, Environment +from azureml.core.runconfig import RunConfiguration, DockerConfiguration + +command = ["python distribute_automl.py"] +ray_environment_name = "aml-ray-cpu" +env = Environment.get(workspace=ws, name=ray_environment_name) +aml_run_config = RunConfiguration(communicator="OpenMpi") +aml_run_config.target = compute_target +aml_run_config.docker = DockerConfiguration(use_docker=True) +aml_run_config.environment = env +aml_run_config.node_count = 2 +config = ScriptRunConfig( + source_directory="ray/", + command=command, + run_config=aml_run_config, +) + +exp = Experiment(ws, "distribute-automl") +run = exp.submit(config) + +print(run.get_portal_url()) # link to ml.azure.com +run.wait_for_completion(show_output=True) +``` + +#### Run distributed tune job + +Prepare a script like [ray/distribute_tune.py](https://github.com/microsoft/FLAML/blob/main/test/ray/distribute_tune.py). Replace the command in the above example with: + +```python +command = ["python distribute_tune.py"] +``` + +Everything else is the same. diff --git a/website/versioned_docs/version-1.0.4/Examples/Integrate - Scikit-learn Pipeline.md b/website/versioned_docs/version-1.0.4/Examples/Integrate - Scikit-learn Pipeline.md new file mode 100644 index 0000000000..ee32ebd0d6 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Examples/Integrate - Scikit-learn Pipeline.md @@ -0,0 +1,67 @@ +As FLAML's AutoML module can be used as a transformer in a Sklearn pipeline, we can get all the benefits of pipeline. + +This example requires openml==0.10.2. + +### Load data + +```python +from flaml.automl.data import load_openml_dataset + +# Download [Airlines dataset](https://www.openml.org/d/1169) from OpenML. The task is to predict whether a given flight will be delayed, given the information of the scheduled departure. 
+X_train, X_test, y_train, y_test = load_openml_dataset( + dataset_id=1169, data_dir='./', random_state=1234, dataset_format='array') +``` + +### Create a pipeline + +```python +from sklearn import set_config +from sklearn.pipeline import Pipeline +from sklearn.impute import SimpleImputer +from sklearn.preprocessing import StandardScaler +from flaml import AutoML + +set_config(display='diagram') + +imputer = SimpleImputer() +standardizer = StandardScaler() +automl = AutoML() + +automl_pipeline = Pipeline([ + ("imputuer",imputer), + ("standardizer", standardizer), + ("automl", automl) +]) +automl_pipeline +``` + +![png](images/pipeline.png) + +### Run AutoML in the pipeline + +```python +automl_settings = { + "time_budget": 60, # total running time in seconds + "metric": "accuracy", # primary metrics can be chosen from: ['accuracy', 'roc_auc', 'roc_auc_weighted', 'roc_auc_ovr', 'roc_auc_ovo', 'f1', 'log_loss', 'mae', 'mse', 'r2'] Check the documentation for more details (https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML#optimization-metric) + "task": "classification", # task type + "estimator_list": ["xgboost", "catboost", "lgbm"], + "log_file_name": "airlines_experiment.log", # flaml log file +} +pipeline_settings = { + f"automl__{key}": value for key, value in automl_settings.items() +} +automl_pipeline.fit(X_train, y_train, **pipeline_settings) +``` + +### Get the automl object from the pipeline + +```python +automl = automl_pipeline.steps[2][1] +# Get the best config and best learner +print('Best ML leaner:', automl.best_estimator) +print('Best hyperparmeter config:', automl.best_config) +print('Best accuracy on validation data: {0:.4g}'.format(1 - automl.best_loss)) +print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time)) +``` + +[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_sklearn.ipynb) | [Open in 
colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_sklearn.ipynb) diff --git a/website/versioned_docs/version-1.0.4/Examples/Tune-AzureML-pipeline.md b/website/versioned_docs/version-1.0.4/Examples/Tune-AzureML-pipeline.md new file mode 100644 index 0000000000..8954ae4cc7 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Examples/Tune-AzureML-pipeline.md @@ -0,0 +1,216 @@ +# Tune - AzureML pipeline + +This example uses flaml to tune an Azure ML pipeline that fits a lightgbm classifier on the [sklearn breast cancer dataset](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)). +If you already have an Azure ML pipeline, you can use the approach to tune your pipeline with flaml. + +## Prepare for tuning + +### Requirements + +We recommend using conda or venv to create a virtual env to install the dependencies. + +```bash +# set up new conda environment +conda create -n pipeline_tune python=3.8 pip=20.2 -y +conda activate pipeline_tune + +# install azureml packages for running AzureML pipelines +pip install azureml-core==1.39.0 +pip install azure-ml-component[notebooks]==0.9.10.post1 +pip install azureml-dataset-runtime==1.39.0 + +# install hydra-core for passing AzureML pipeline parameters +pip install hydra-core==1.1.1 + +# install flaml +pip install flaml[blendsearch,ray]==1.0.9 +``` + +### Azure ML training pipeline + +Before we are ready for tuning, we must first have an Azure ML pipeline. +In this example, we use the following toy pipeline for illustration. +The pipeline consists of two steps: (1) data preparation and (2) model training. + +![png](images/AzureML_train_pipeline.png). + +The [code example](https://github.com/microsoft/FLAML/tree/main/test/pipeline_tuning_example) discussed in the page is included in +`test/pipeline_tuning_example/`. +We will use the relative path in the rest of the page. + +### Data + +The example data exists in `data/data.csv`. 
+It will be uploaded to AzureML workspace to be consumed by the training pipeline +using the following code. + +```python +Dataset.File.upload_directory( + src_dir=to_absolute_path(LOCAL_DIR / "data"), + target=(datastore, "classification_data"), + overwrite=True, +) + +dataset = Dataset.File.from_files(path=(datastore, 'classification_data')) +``` + +### Configurations for the pipeline + +The pipeline configuration is defined in +`configs/train_config.yaml`. + +```yaml +hydra: + searchpath: + - file://. + +aml_config: + workspace_name: your_workspace_name + resource_group: your_resource_group + subscription_id: your_subscription_id + cpu_target: cpucluster + +train_config: + exp_name: sklearn_breast_cancer_classification + test_train_ratio: 0.4 + learning_rate: 0.05 + n_estimators: 50 +``` + +### Define and submit the pipeline + +The pipeline was defined in +`submit_train_pipeline.py`. + +To submit the pipeline, please specify your AzureML resources +in the `configs/train_config.yaml` and run + +```bash +cd test/pipeline_tuning_example +python submit_train_pipeline.py +``` + +To get the pipeline ready for HPO, in the training step, +we need to log the metrics of interest to AzureML using + +```python +run.log(f"{data_name}_{eval_name}", result) +``` + +## Hyperparameter Optimization + +We are now ready to set up the HPO job for the AzureML pipeline, including: + +- config the HPO job, +- set up the interaction between the HPO job and the training job. + +These two steps are done in `tuner/tuner_func.py`. + +### Set up the tune job + +`tuner_func.tune_pipeline` sets up the search space, metric to optimize, mode, etc. 
+ +```python +def tune_pipeline(concurrent_run=1): + start_time = time.time() + + # config the HPO job + search_space = { + "train_config.n_estimators": flaml.tune.randint(50, 200), + "train_config.learning_rate": flaml.tune.uniform(0.01, 0.5), + } + + hp_metric = "eval_binary_error" + mode = "max" + num_samples = 2 + + + if concurrent_run > 1: + import ray # For parallel tuning + + ray.init(num_cpus=concurrent_run) + use_ray = True + else: + use_ray = False + + # launch the HPO job + analysis = flaml.tune.run( + run_with_config, + config=search_space, + metric=hp_metric, + mode=mode, + num_samples=num_samples, # number of trials + use_ray=use_ray, + ) + + # get the best config + best_trial = analysis.get_best_trial(hp_metric, mode, "all") + metric = best_trial.metric_analysis[hp_metric][mode] + print(f"n_trials={len(analysis.trials)}") + print(f"time={time.time()-start_time}") + print(f"Best {hp_metric}: {metric:.4f}") + print(f"Best configuration: {best_trial.config}") +``` + +### Interact with AzureML pipeline jobs + +The interaction between FLAML and AzureML pipeline jobs is in `tuner_func.run_with_config`. + +```python +def run_with_config(config: dict): + """Run the pipeline with a given config dict + """ + + # pass the hyperparameters to AzureML jobs by overwriting the config file. + overrides = [f"{key}={value}" for key, value in config.items()] + + print(overrides) + run = submit_train_pipeline.build_and_submit_aml_pipeline(overrides) + + print(run.get_portal_url()) + + # retrieving the metrics to optimize before the job completes. 
+ stop = False + while not stop: + # get status + status = run._core_run.get_status() + print(f'status: {status}') + + # get metrics + metrics = run._core_run.get_metrics(recursive=True) + if metrics: + run_metrics = list(metrics.values()) + + new_metric = run_metrics[0]['eval_binary_error'] + + if type(new_metric) == list: + new_metric = new_metric[-1] + + print(f'eval_binary_error: {new_metric}') + + tune.report(eval_binary_error=new_metric) + + time.sleep(5) + + if status == 'FAILED' or status == 'Completed': + stop = True + + print("The run is terminated.") + print(status) + + return +``` + +Overall, to tune the hyperparameters of the AzureML pipeline, run: + +```bash +# the training job will run remotely as an AzureML job in both choices +# run the tuning job locally +python submit_tune.py --local +# run the tuning job remotely +python submit_tune.py --remote --subscription_id --resource_group --workspace +``` + +The local option runs the `tuner/tuner_func.py` in your local machine. +The remote option wraps up the `tuner/tuner_func.py` as an AzureML component and +starts another AzureML job to tune the AzureML pipeline. diff --git a/website/versioned_docs/version-1.0.4/Examples/Tune-HuggingFace.md b/website/versioned_docs/version-1.0.4/Examples/Tune-HuggingFace.md new file mode 100644 index 0000000000..25b5e13ce2 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Examples/Tune-HuggingFace.md @@ -0,0 +1,191 @@ +# Tune - HuggingFace + +This example uses flaml to finetune a transformer model from Huggingface transformers library. + +*Note*: `flaml.AutoML` has built-in support for certain finetuning tasks with a +[higher-level API](AutoML-NLP). +It may be easier to use that API unless you have special requirements not handled by that API. + +### Requirements + +This example requires GPU. 
Install dependencies: +```python +pip install torch transformers datasets "flaml[blendsearch,ray]" +``` + +### Prepare for tuning + +#### Tokenizer + +```python +from transformers import AutoTokenizer + +MODEL_NAME = "distilbert-base-uncased" +tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True) +COLUMN_NAME = "sentence" + +def tokenize(examples): + return tokenizer(examples[COLUMN_NAME], truncation=True) +``` + +#### Define training method + +```python +import flaml +import datasets +from transformers import AutoModelForSequenceClassification + +TASK = "cola" +NUM_LABELS = 2 + +def train_distilbert(config: dict): + # Load CoLA dataset and apply tokenizer + cola_raw = datasets.load_dataset("glue", TASK) + cola_encoded = cola_raw.map(tokenize, batched=True) + train_dataset, eval_dataset = cola_encoded["train"], cola_encoded["validation"] + + model = AutoModelForSequenceClassification.from_pretrained( + MODEL_NAME, num_labels=NUM_LABELS + ) + metric = datasets.load_metric("glue", TASK) + + def compute_metrics(eval_pred): + predictions, labels = eval_pred + predictions = np.argmax(predictions, axis=1) + return metric.compute(predictions=predictions, references=labels) + + training_args = TrainingArguments( + output_dir='.', + do_eval=False, + disable_tqdm=True, + logging_steps=20000, + save_total_limit=0, + **config, + ) + + trainer = Trainer( + model, + training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + ) + + # train model + trainer.train() + + # evaluate model + eval_output = trainer.evaluate() + + # report the metric to optimize & the metric to log + flaml.tune.report( + loss=eval_output["eval_loss"], + matthews_correlation=eval_output["eval_matthews_correlation"], + ) +``` + +### Define the search + +We are now ready to define our search. 
This includes: + +- The `search_space` for our hyperparameters +- The `metric` and the `mode` ('max' or 'min') for optimization +- The constraints (`n_cpus`, `n_gpus`, `num_samples`, and `time_budget_s`) + +```python +max_num_epoch = 64 +search_space = { + # You can mix constants with search space objects. + "num_train_epochs": flaml.tune.loguniform(1, max_num_epoch), + "learning_rate": flaml.tune.loguniform(1e-6, 1e-4), + "adam_epsilon": flaml.tune.loguniform(1e-9, 1e-7), + "adam_beta1": flaml.tune.uniform(0.8, 0.99), + "adam_beta2": flaml.tune.loguniform(98e-2, 9999e-4), +} + +# optimization objective +HP_METRIC, MODE = "matthews_correlation", "max" + +# resources +num_cpus = 4 +num_gpus = 4 # change according to your GPU resources + +# constraints +num_samples = -1 # number of trials, -1 means unlimited +time_budget_s = 3600 # time budget in seconds +``` + +### Launch the tuning + +We are now ready to launch the tuning using `flaml.tune.run`: + +```python +import ray + +ray.init(num_cpus=num_cpus, num_gpus=num_gpus) +print("Tuning started...") +analysis = flaml.tune.run( + train_distilbert, + search_alg=flaml.CFO( + space=search_space, + metric=HP_METRIC, + mode=MODE, + low_cost_partial_config={"num_train_epochs": 1}), + resources_per_trial={"gpu": num_gpus, "cpu": num_cpus}, + local_dir='logs/', + num_samples=num_samples, + time_budget_s=time_budget_s, + use_ray=True, +) +``` + +This will run tuning for one hour. At the end we will see a summary. +``` +== Status == +Memory usage on this node: 32.0/251.6 GiB +Using FIFO scheduling algorithm. 
+Resources requested: 0/4 CPUs, 0/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100) +Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58 +Number of trials: 22/infinite (22 TERMINATED) +Trial name status loc adam_beta1 adam_beta2 adam_epsilon learning_rate num_train_epochs iter total time (s) loss matthews_correlation +train_distilbert_a0c303d0 TERMINATED 0.939079 0.991865 7.96945e-08 5.61152e-06 1 1 55.6909 0.587986 0 +train_distilbert_a0c303d1 TERMINATED 0.811036 0.997214 2.05111e-09 2.05134e-06 1.44427 1 71.7663 0.603018 0 +train_distilbert_c39b2ef0 TERMINATED 0.909395 0.993715 1e-07 5.26543e-06 1 1 53.7619 0.586518 0 +train_distilbert_f00776e2 TERMINATED 0.968763 0.990019 4.38943e-08 5.98035e-06 1.02723 1 56.8382 0.581313 0 +train_distilbert_11ab3900 TERMINATED 0.962198 0.991838 7.09296e-08 5.06608e-06 1 1 54.0231 0.585576 0 +train_distilbert_353025b6 TERMINATED 0.91596 0.991892 8.95426e-08 6.21568e-06 2.15443 1 98.3233 0.531632 0.388893 +train_distilbert_5728a1de TERMINATED 0.926933 0.993146 1e-07 1.00902e-05 1 1 55.3726 0.538505 0.280558 +train_distilbert_9394c2e2 TERMINATED 0.928106 0.990614 4.49975e-08 3.45674e-06 2.72935 1 121.388 0.539177 0.327295 +train_distilbert_b6543fec TERMINATED 0.876896 0.992098 1e-07 7.01176e-06 1.59538 1 76.0244 0.527516 0.379177 +train_distilbert_0071f998 TERMINATED 0.955024 0.991687 7.39776e-08 5.50998e-06 2.90939 1 126.871 0.516225 0.417157 +train_distilbert_2f830be6 TERMINATED 0.886931 0.989628 7.6127e-08 4.37646e-06 1.53338 1 73.8934 0.551629 0.0655887 +train_distilbert_7ce03f12 TERMINATED 0.984053 0.993956 8.70144e-08 7.82557e-06 4.08775 1 174.027 0.523732 0.453549 +train_distilbert_aaab0508 TERMINATED 0.940707 0.993946 1e-07 8.91979e-06 3.40243 1 146.249 0.511288 0.45085 +train_distilbert_14262454 TERMINATED 0.99 0.991696 4.60093e-08 4.83405e-06 3.4954 1 152.008 0.53506 0.400851 +train_distilbert_6d211fe6 TERMINATED 0.959277 0.994556 5.40791e-08 1.17333e-05 
6.64995 1 271.444 0.609851 0.526802 +train_distilbert_c980bae4 TERMINATED 0.99 0.993355 1e-07 5.21929e-06 2.51275 1 111.799 0.542276 0.324968 +train_distilbert_6d0d29d6 TERMINATED 0.965773 0.995182 9.9752e-08 1.15549e-05 13.694 1 527.944 0.923802 0.549474 +train_distilbert_b16ea82a TERMINATED 0.952781 0.993931 2.93182e-08 1.19145e-05 3.2293 1 139.844 0.533466 0.451307 +train_distilbert_eddf7cc0 TERMINATED 0.99 0.997109 8.13498e-08 1.28515e-05 15.5807 1 614.789 0.983285 0.56993 +train_distilbert_43008974 TERMINATED 0.929089 0.993258 1e-07 1.03892e-05 12.0357 1 474.387 0.857461 0.520022 +train_distilbert_b3408a4e TERMINATED 0.99 0.993809 4.67441e-08 1.10418e-05 11.9165 1 474.126 0.828205 0.526164 +train_distilbert_cfbfb220 TERMINATED 0.979454 0.9999 1e-07 1.49578e-05 20.3715 +``` + +### Retrieve the results + +```python +best_trial = analysis.get_best_trial(HP_METRIC, MODE, "all") +metric = best_trial.metric_analysis[HP_METRIC][MODE] +print(f"n_trials={len(analysis.trials)}") +print(f"time={time.time()-start_time}") +print(f"Best model eval {HP_METRIC}: {metric:.4f}") +print(f"Best model parameters: {best_trial.config}") +# n_trials=22 +# time=3999.769361972809 +# Best model eval matthews_correlation: 0.5699 +# Best model parameters: {'num_train_epochs': 15.580684188655825, 'learning_rate': 1.2851507818900338e-05, 'adam_epsilon': 8.134982521948352e-08, 'adam_beta1': 0.99, 'adam_beta2': 0.9971094424784387} +``` + +[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/tune_huggingface.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/tune_huggingface.ipynb) \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.4/Examples/Tune-Lexicographic-objectives.md b/website/versioned_docs/version-1.0.4/Examples/Tune-Lexicographic-objectives.md new file mode 100644 index 0000000000..b215c37282 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Examples/Tune-Lexicographic-objectives.md 
@@ -0,0 +1,166 @@ +# Tune - Lexicographic Objectives + +## Requirements + +```bash +pip install "flaml>=1.1.0" thop torchvision torch +``` +Tuning multiple objectives with Lexicographic preference is a new feature added in version 1.1.0 and is subject to change in future versions. + +## Tuning accurate and efficient neural networks with lexicographic preference + +### Data + +```python +import torch +import thop +import torch.nn as nn +from flaml import tune +import torch.nn.functional as F +import torchvision +import numpy as np +import os + +DEVICE = torch.device("cpu") +BATCHSIZE = 128 +N_TRAIN_EXAMPLES = BATCHSIZE * 30 +N_VALID_EXAMPLES = BATCHSIZE * 10 +data_dir = os.path.abspath("data") + +train_dataset = torchvision.datasets.FashionMNIST( + data_dir, + train=True, + download=True, + transform=torchvision.transforms.ToTensor(), +) + +train_loader = torch.utils.data.DataLoader( + torch.utils.data.Subset(train_dataset, list(range(N_TRAIN_EXAMPLES))), + batch_size=BATCHSIZE, + shuffle=True, +) + +val_dataset = torchvision.datasets.FashionMNIST( + data_dir, train=False, transform=torchvision.transforms.ToTensor() +) + +val_loader = torch.utils.data.DataLoader( + torch.utils.data.Subset(val_dataset, list(range(N_VALID_EXAMPLES))), + batch_size=BATCHSIZE, + shuffle=True, +) +``` + +### Specify the model + +```python +def define_model(configuration): + n_layers = configuration["n_layers"] + layers = [] + in_features = 28 * 28 + for i in range(n_layers): + out_features = configuration["n_units_l{}".format(i)] + layers.append(nn.Linear(in_features, out_features)) + layers.append(nn.ReLU()) + p = configuration["dropout_{}".format(i)] + layers.append(nn.Dropout(p)) + in_features = out_features + layers.append(nn.Linear(in_features, 10)) + layers.append(nn.LogSoftmax(dim=1)) + return nn.Sequential(*layers) +``` + +### Train + +```python +def train_model(model, optimizer, train_loader): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + data, 
target = data.view(-1, 28 * 28).to(DEVICE), target.to(DEVICE) + optimizer.zero_grad() + F.nll_loss(model(data), target).backward() + optimizer.step() +``` + +### Metrics + +```python +def eval_model(model, valid_loader): + model.eval() + correct = 0 + with torch.no_grad(): + for batch_idx, (data, target) in enumerate(valid_loader): + data, target = data.view(-1, 28 * 28).to(DEVICE), target.to(DEVICE) + pred = model(data).argmax(dim=1, keepdim=True) + correct += pred.eq(target.view_as(pred)).sum().item() + + accuracy = correct / N_VALID_EXAMPLES + flops, params = thop.profile( + model, inputs=(torch.randn(1, 28 * 28).to(DEVICE),), verbose=False + ) + return np.log2(flops), 1 - accuracy, params +``` + + + +### Evaluation function + +```python +def evaluate_function(configuration): + model = define_model(configuration).to(DEVICE) + optimizer = torch.optim.Adam(model.parameters(), configuration["lr"]) + n_epoch = configuration["n_epoch"] + for epoch in range(n_epoch): + train_model(model, optimizer, train_loader) + flops, error_rate, params = eval_model(model, val_loader) + return {"error_rate": error_rate, "flops": flops, "params": params} +``` + +### Search space +```python +search_space = { + "n_layers": tune.randint(lower=1, upper=3), + "n_units_l0": tune.randint(lower=4, upper=128), + "n_units_l1": tune.randint(lower=4, upper=128), + "n_units_l2": tune.randint(lower=4, upper=128), + "dropout_0": tune.uniform(lower=0.2, upper=0.5), + "dropout_1": tune.uniform(lower=0.2, upper=0.5), + "dropout_2": tune.uniform(lower=0.2, upper=0.5), + "lr": tune.loguniform(lower=1e-5, upper=1e-1), + "n_epoch": tune.randint(lower=1, upper=20), +} +``` + +### Launch the tuning process + +```python + +# Low cost initial point +low_cost_partial_config = { + "n_layers": 1, + "n_units_l0": 4, + "n_units_l1": 4, + "n_units_l2": 4, + "n_epoch": 1, +} + +# Specific lexicographic preference +lexico_objectives = {} +lexico_objectives["metrics"] = ["error_rate", "flops"] 
+lexico_objectives["tolerances"] = {"error_rate": 0.02, "flops": 0.0} +lexico_objectives["targets"] = {"error_rate": 0.0, "flops": 0.0} +lexico_objectives["modes"] = ["min", "min"] + +# launch the tuning process +analysis = tune.run( + evaluate_function, + num_samples=-1, + time_budget_s=100, + config=search_space, # search space of NN + use_ray=False, + lexico_objectives=lexico_objectives, + low_cost_partial_config=low_cost_partial_config, # low cost initial point +) +``` + + +[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/tune_lexicographic.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/tune_lexicographic.ipynb) diff --git a/website/versioned_docs/version-1.0.4/Examples/Tune-PyTorch.md b/website/versioned_docs/version-1.0.4/Examples/Tune-PyTorch.md new file mode 100644 index 0000000000..83f38e6098 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Examples/Tune-PyTorch.md @@ -0,0 +1,286 @@ +# Tune - PyTorch + +This example uses flaml to tune a pytorch model on CIFAR10. + +## Prepare for tuning + +### Requirements +```bash +pip install torchvision "flaml[blendsearch,ray]" +``` + +Before we are ready for tuning, we first need to define the neural network that we would like to tune. 
+ +### Network Specification + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.utils.data import random_split +import torchvision +import torchvision.transforms as transforms + + +class Net(nn.Module): + + def __init__(self, l1=120, l2=84): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, l1) + self.fc2 = nn.Linear(l1, l2) + self.fc3 = nn.Linear(l2, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x +``` + +### Data + +```python +def load_data(data_dir="data"): + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + ]) + + trainset = torchvision.datasets.CIFAR10( + root=data_dir, train=True, download=True, transform=transform) + + testset = torchvision.datasets.CIFAR10( + root=data_dir, train=False, download=True, transform=transform) + + return trainset, testset +``` + +### Training + +```python +from ray import tune + +def train_cifar(config, checkpoint_dir=None, data_dir=None): + if "l1" not in config: + logger.warning(config) + net = Net(2**config["l1"], 2**config["l2"]) + + device = "cpu" + if torch.cuda.is_available(): + device = "cuda:0" + if torch.cuda.device_count() > 1: + net = nn.DataParallel(net) + net.to(device) + + criterion = nn.CrossEntropyLoss() + optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) + + # The `checkpoint_dir` parameter gets passed by Ray Tune when a checkpoint + # should be restored. 
+ if checkpoint_dir: + checkpoint = os.path.join(checkpoint_dir, "checkpoint") + model_state, optimizer_state = torch.load(checkpoint) + net.load_state_dict(model_state) + optimizer.load_state_dict(optimizer_state) + + trainset, testset = load_data(data_dir) + + test_abs = int(len(trainset) * 0.8) + train_subset, val_subset = random_split( + trainset, [test_abs, len(trainset) - test_abs]) + + trainloader = torch.utils.data.DataLoader( + train_subset, + batch_size=int(2**config["batch_size"]), + shuffle=True, + num_workers=4) + valloader = torch.utils.data.DataLoader( + val_subset, + batch_size=int(2**config["batch_size"]), + shuffle=True, + num_workers=4) + + for epoch in range(int(round(config["num_epochs"]))): # loop over the dataset multiple times + running_loss = 0.0 + epoch_steps = 0 + for i, data in enumerate(trainloader, 0): + # get the inputs; data is a list of [inputs, labels] + inputs, labels = data + inputs, labels = inputs.to(device), labels.to(device) + + # zero the parameter gradients + optimizer.zero_grad() + + # forward + backward + optimize + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + # print statistics + running_loss += loss.item() + epoch_steps += 1 + if i % 2000 == 1999: # print every 2000 mini-batches + print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, + running_loss / epoch_steps)) + running_loss = 0.0 + + # Validation loss + val_loss = 0.0 + val_steps = 0 + total = 0 + correct = 0 + for i, data in enumerate(valloader, 0): + with torch.no_grad(): + inputs, labels = data + inputs, labels = inputs.to(device), labels.to(device) + + outputs = net(inputs) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + loss = criterion(outputs, labels) + val_loss += loss.cpu().numpy() + val_steps += 1 + + # Here we save a checkpoint. 
It is automatically registered with + # Ray Tune and will potentially be passed as the `checkpoint_dir` + # parameter in future iterations. + with tune.checkpoint_dir(step=epoch) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + torch.save( + (net.state_dict(), optimizer.state_dict()), path) + + tune.report(loss=(val_loss / val_steps), accuracy=correct / total) + print("Finished Training") +``` + +### Test Accuracy + +```python +def _test_accuracy(net, device="cpu"): + trainset, testset = load_data() + + testloader = torch.utils.data.DataLoader( + testset, batch_size=4, shuffle=False, num_workers=2) + + correct = 0 + total = 0 + with torch.no_grad(): + for data in testloader: + images, labels = data + images, labels = images.to(device), labels.to(device) + outputs = net(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + return correct / total +``` + +## Hyperparameter Optimization + +```python +import numpy as np +import flaml +import os + +data_dir = os.path.abspath("data") +load_data(data_dir) # Download data for all trials before starting the run +``` + +### Search space + +```python +max_num_epoch = 100 +config = { + "l1": tune.randint(2, 9), # log transformed with base 2 + "l2": tune.randint(2, 9), # log transformed with base 2 + "lr": tune.loguniform(1e-4, 1e-1), + "num_epochs": tune.loguniform(1, max_num_epoch), + "batch_size": tune.randint(1, 5) # log transformed with base 2 +} +``` + +### Budget and resource constraints + +```python +time_budget_s = 600 # time budget in seconds +gpus_per_trial = 0.5 # number of gpus for each trial; 0.5 means two training jobs can share one gpu +num_samples = 500 # maximal number of trials +np.random.seed(7654321) +``` + +### Launch the tuning + +```python +import time +start_time = time.time() +result = flaml.tune.run( + tune.with_parameters(train_cifar, data_dir=data_dir), + config=config, + metric="loss", + mode="min", + 
low_cost_partial_config={"num_epochs": 1}, + max_resource=max_num_epoch, + min_resource=1, + scheduler="asha", # Use asha scheduler to perform early stopping based on intermediate results reported + resources_per_trial={"cpu": 1, "gpu": gpus_per_trial}, + local_dir='logs/', + num_samples=num_samples, + time_budget_s=time_budget_s, + use_ray=True) +``` + +### Check the result + +```python +print(f"#trials={len(result.trials)}") +print(f"time={time.time()-start_time}") +best_trial = result.get_best_trial("loss", "min", "all") +print("Best trial config: {}".format(best_trial.config)) +print("Best trial final validation loss: {}".format( + best_trial.metric_analysis["loss"]["min"])) +print("Best trial final validation accuracy: {}".format( + best_trial.metric_analysis["accuracy"]["max"])) + +best_trained_model = Net(2**best_trial.config["l1"], + 2**best_trial.config["l2"]) +device = "cpu" +if torch.cuda.is_available(): + device = "cuda:0" + if gpus_per_trial > 1: + best_trained_model = nn.DataParallel(best_trained_model) +best_trained_model.to(device) + +checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint") + +model_state, optimizer_state = torch.load(checkpoint_path) +best_trained_model.load_state_dict(model_state) + +test_acc = _test_accuracy(best_trained_model, device) +print("Best trial test set accuracy: {}".format(test_acc)) +``` + +### Sample of output + +``` +#trials=44 +time=1193.913584947586 +Best trial config: {'l1': 8, 'l2': 8, 'lr': 0.0008818671030627281, 'num_epochs': 55.9513429004283, 'batch_size': 3} +Best trial final validation loss: 1.0694482081472874 +Best trial final validation accuracy: 0.6389 +Files already downloaded and verified +Files already downloaded and verified +Best trial test set accuracy: 0.6294 +``` + +[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/tune_pytorch.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/tune_pytorch.ipynb) \ No 
newline at end of file diff --git a/website/versioned_docs/version-1.0.4/Examples/images/AzureML_train_pipeline.png b/website/versioned_docs/version-1.0.4/Examples/images/AzureML_train_pipeline.png new file mode 100644 index 0000000000..d20df6ead9 Binary files /dev/null and b/website/versioned_docs/version-1.0.4/Examples/images/AzureML_train_pipeline.png differ diff --git a/website/versioned_docs/version-1.0.4/Examples/images/CO2.png b/website/versioned_docs/version-1.0.4/Examples/images/CO2.png new file mode 100644 index 0000000000..684df085c8 Binary files /dev/null and b/website/versioned_docs/version-1.0.4/Examples/images/CO2.png differ diff --git a/website/versioned_docs/version-1.0.4/Examples/images/lgbm_curve.png b/website/versioned_docs/version-1.0.4/Examples/images/lgbm_curve.png new file mode 100644 index 0000000000..8ef8365f57 Binary files /dev/null and b/website/versioned_docs/version-1.0.4/Examples/images/lgbm_curve.png differ diff --git a/website/versioned_docs/version-1.0.4/Examples/images/pipeline.png b/website/versioned_docs/version-1.0.4/Examples/images/pipeline.png new file mode 100644 index 0000000000..2488f4e1d5 Binary files /dev/null and b/website/versioned_docs/version-1.0.4/Examples/images/pipeline.png differ diff --git a/website/versioned_docs/version-1.0.4/Examples/images/xgb_curve.png b/website/versioned_docs/version-1.0.4/Examples/images/xgb_curve.png new file mode 100644 index 0000000000..29ff34cf16 Binary files /dev/null and b/website/versioned_docs/version-1.0.4/Examples/images/xgb_curve.png differ diff --git a/website/versioned_docs/version-1.0.4/Examples/images/xgb_feature_importance.png b/website/versioned_docs/version-1.0.4/Examples/images/xgb_feature_importance.png new file mode 100644 index 0000000000..c4cef1b3d7 Binary files /dev/null and b/website/versioned_docs/version-1.0.4/Examples/images/xgb_feature_importance.png differ diff --git a/website/versioned_docs/version-1.0.4/FAQ.md b/website/versioned_docs/version-1.0.4/FAQ.md 
new file mode 100644 index 0000000000..232e390c05 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/FAQ.md @@ -0,0 +1,81 @@ +# Frequently Asked Questions + +### [Guidelines on how to set a hyperparameter search space](Use-Cases/Tune-User-Defined-Function#details-and-guidelines-on-hyperparameter-search-space) + +### [Guidelines on parallel vs sequential tuning](Use-Cases/Task-Oriented-AutoML#guidelines-on-parallel-vs-sequential-tuning) + +### [Guidelines on creating and tuning a custom estimator](Use-Cases/Task-Oriented-AutoML#guidelines-on-tuning-a-custom-estimator) + + +### About `low_cost_partial_config` in `tune`. + +- Definition and purpose: The `low_cost_partial_config` is a dictionary of a subset of the hyperparameter coordinates whose value corresponds to a configuration with known low-cost (i.e., low computation cost for training the corresponding model). The concept of low/high-cost is meaningful in the case where a subset of the hyperparameters to tune directly affects the computation cost for training the model. For example, `n_estimators` and `max_leaves` are known to affect the training cost of tree-based learners. We call this subset of hyperparameters, *cost-related hyperparameters*. In such scenarios, if you are aware of low-cost configurations for the cost-related hyperparameters, you are recommended to set them as the `low_cost_partial_config`. Using the tree-based method example again, since we know that small `n_estimators` and `max_leaves` generally correspond to simpler models and thus lower cost, we set `{'n_estimators': 4, 'max_leaves': 4}` as the `low_cost_partial_config` by default (note that `4` is the lower bound of search space for these two hyperparameters), e.g., in [LGBM](https://github.com/microsoft/FLAML/blob/main/flaml/model.py#L215). Configuring `low_cost_partial_config` helps the search algorithms make more cost-efficient choices. 
+In AutoML, the `low_cost_init_value` in `search_space()` function for each estimator serves the same role. + +- Usage in practice: It is recommended to configure it if there are cost-related hyperparameters in your tuning task and you happen to know the low-cost values for them, but it is not required (It is fine to leave it the default value, i.e., `None`). + +- How does it work: `low_cost_partial_config` if configured, will be used as an initial point of the search. It also affects the search trajectory. For more details about how does it play a role in the search algorithms, please refer to the papers about the search algorithms used: Section 2 of [Frugal Optimization for Cost-related Hyperparameters (CFO)](https://arxiv.org/pdf/2005.01571.pdf) and Section 3 of [Economical Hyperparameter Optimization with Blended Search Strategy (BlendSearch)](https://openreview.net/pdf?id=VbLH04pRA3). + + +### How does FLAML handle imbalanced data (unequal distribution of target classes in classification task)? + +Currently FLAML does several things for imbalanced data. + +1. When a class contains fewer than 20 examples, we repeatedly add these examples to the training data until the count is at least 20. +2. We use stratified sampling when doing holdout and kf. +3. We make sure no class is empty in both training and holdout data. +4. We allow users to pass `sample_weight` to `AutoML.fit()`. +5. User can customize the weight of each class by setting the `custom_hp` or `fit_kwargs_by_estimator` arguments. For example, the following code sets the weight for pos vs. 
neg as 2:1 for the RandomForest estimator: + +```python +from flaml import AutoML +from sklearn.datasets import load_iris + +X_train, y_train = load_iris(return_X_y=True) +automl = AutoML() +automl_settings = { + "time_budget": 2, + "task": "classification", + "log_file_name": "test/iris.log", + "estimator_list": ["rf", "xgboost"], +} + +automl_settings["custom_hp"] = { + "xgboost": { + "scale_pos_weight": { + "domain": 0.5, + "init_value": 0.5, + } + }, + "rf": { + "class_weight": { + "domain": "balanced", + "init_value": "balanced" + } + } +} +print(automl.model) +``` + + +### How to interpret model performance? Is it possible for me to visualize feature importance, SHAP values, optimization history? + +You can use ```automl.model.estimator.feature_importances_``` to get the `feature_importances_` for the best model found by automl. See an [example](Examples/AutoML-for-XGBoost#plot-feature-importance). + +Packages such as `azureml-interpret` and `sklearn.inspection.permutation_importance` can be used on `automl.model.estimator` to explain the selected model. +Model explanation is frequently asked and adding a native support may be a good feature. Suggestions/contributions are welcome. + +Optimization history can be checked from the [log](Use-Cases/Task-Oriented-AutoML#log-the-trials). You can also [retrieve the log and plot the learning curve](Use-Cases/Task-Oriented-AutoML#plot-learning-curve). + + +### How to resolve out-of-memory error in `AutoML.fit()` + +* Set `free_mem_ratio` a float between 0 and 1. For example, 0.2 means try to keep free memory above 20% of total memory. Training may be early stopped for memory consumption reason when this is set. +* Set `model_history` False. +* If your data are already preprocessed, set `skip_transform` False. If you can preprocess the data before the fit starts, this setting can save memory needed for preprocessing in `fit`. +* If the OOM error only happens for some particular trials: + - set `use_ray` True. 
This will increase the overhead per trial but can keep the AutoML process running when a single trial fails due to OOM error. + - provide a more accurate [`size`](reference/automl/model#size) function for the memory bytes consumption of each config for the estimator causing this error. + - modify the [search space](Use-Cases/Task-Oriented-AutoML#a-shortcut-to-override-the-search-space) for the estimators causing this error. + - or remove this estimator from the `estimator_list`. +* If the OOM error happens when ensembling, consider disabling ensemble, or use a cheaper ensemble option. ([Example](Use-Cases/Task-Oriented-AutoML#ensemble)). diff --git a/website/versioned_docs/version-1.0.4/Getting-Started.md b/website/versioned_docs/version-1.0.4/Getting-Started.md new file mode 100644 index 0000000000..e3e828cbcd --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Getting-Started.md @@ -0,0 +1,101 @@ +# Getting Started + + + +FLAML is a lightweight Python library that finds accurate machine +learning models automatically, efficiently and economically. It frees users from selecting learners and hyperparameters for each learner. + +### Main Features + +1. For common machine learning tasks like classification and regression, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks. + +2. It is easy to customize or extend. Users can find their desired customizability from a smooth range: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), or full customization (arbitrary training and evaluation code). Users can customize only when and what they need to, and leave the rest to the library. + +3. It supports fast and economical automatic tuning, capable of handling large search space with heterogeneous evaluation cost and complex constraints/guidance/early stopping. 
FLAML is powered by a new, [cost-effective +hyperparameter optimization](Use-Cases/Tune-User-Defined-Function#hyperparameter-optimization-algorithm) +and learner selection method invented by Microsoft Research. + +### Quickstart + +Install FLAML from pip: `pip install flaml`. Find more options in [Installation](Installation). + +There are several ways of using flaml: + +#### [Task-oriented AutoML](Use-Cases/task-oriented-automl) + +For example, with three lines of code, you can start using this economical and fast AutoML engine as a scikit-learn style estimator. + +```python +from flaml import AutoML +automl = AutoML() +automl.fit(X_train, y_train, task="classification", time_budget=60) +``` + +It automatically tunes the hyperparameters and selects the best model from default learners such as LightGBM, XGBoost, random forest etc. for the specified time budget 60 seconds. [Customizing](Use-Cases/task-oriented-automl#customize-automlfit) the optimization metrics, learners and search spaces etc. is very easy. For example, + +```python +automl.add_learner("mylgbm", MyLGBMEstimator) +automl.fit(X_train, y_train, task="classification", metric=custom_metric, estimator_list=["mylgbm"], time_budget=60) +``` + +#### [Tune user-defined function](Use-Cases/Tune-User-Defined-Function) + +You can run generic hyperparameter tuning for a custom function (machine learning or beyond). 
For example, + +```python +from flaml import tune +from flaml.automl.model import LGBMEstimator + + +def train_lgbm(config: dict) -> dict: + # convert config dict to lgbm params + params = LGBMEstimator(**config).params + # train the model + train_set = lightgbm.Dataset(csv_file_name) + model = lightgbm.train(params, train_set) + # evaluate the model + pred = model.predict(X_test) + mse = mean_squared_error(y_test, pred) + # return eval results as a dictionary + return {"mse": mse} + + +# load a built-in search space from flaml +flaml_lgbm_search_space = LGBMEstimator.search_space(X_train.shape) +# specify the search space as a dict from hp name to domain; you can define your own search space same way +config_search_space = {hp: space["domain"] for hp, space in flaml_lgbm_search_space.items()} +# give guidance about hp values corresponding to low training cost, i.e., {"n_estimators": 4, "num_leaves": 4} +low_cost_partial_config = { + hp: space["low_cost_init_value"] + for hp, space in flaml_lgbm_search_space.items() + if "low_cost_init_value" in space +} +# run the tuning, minimizing mse, with total time budget 3 seconds +analysis = tune.run( + train_lgbm, metric="mse", mode="min", config=config_search_space, + low_cost_partial_config=low_cost_partial_config, time_budget_s=3, num_samples=-1, +) +``` +Please see this [script](https://github.com/microsoft/FLAML/blob/main/test/tune_example.py) for the complete version of the above example. + +#### [Zero-shot AutoML](Use-Cases/Zero-Shot-AutoML) + +FLAML offers a unique, seamless and effortless way to leverage AutoML for the commonly used classifiers and regressors such as LightGBM and XGBoost. For example, if you are using `lightgbm.LGBMClassifier` as your current learner, all you need to do is to replace `from lightgbm import LGBMClassifier` by: + +```python +from flaml.default import LGBMClassifier +``` + +Then, you can use it just like you use the original `LGMBClassifier`. Your other code can remain unchanged. 
When you call the `fit()` function from `flaml.default.LGBMClassifier`, it will automatically instantiate a good data-dependent hyperparameter configuration for your dataset, which is expected to work better than the default configuration. + +### Where to Go Next? + +* Understand the use cases for [Task-oriented AutoML](Use-Cases/task-oriented-automl), [Tune user-defined function](Use-Cases/Tune-User-Defined-Function) and [Zero-shot AutoML](Use-Cases/Zero-Shot-AutoML). +* Find code examples under "Examples": from [AutoML - Classification](Examples/AutoML-Classification) to [Tune - PyTorch](Examples/Tune-PyTorch). +* Find [talks](https://www.youtube.com/channel/UCfU0zfFXHXdAd5x-WvFBk5A) and [tutorials](https://github.com/microsoft/FLAML/tree/tutorial/tutorial) about FLAML. +* Learn about [research](Research) around FLAML. +* Refer to [SDK](reference/automl/automl) and [FAQ](FAQ). + +If you like our project, please give it a [star](https://github.com/microsoft/FLAML/stargazers) on GitHub. If you are interested in contributing, please read [Contributor's Guide](Contribute). + + diff --git a/website/versioned_docs/version-1.0.4/Installation.md b/website/versioned_docs/version-1.0.4/Installation.md new file mode 100644 index 0000000000..7cc37943a1 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Installation.md @@ -0,0 +1,88 @@ +# Installation + +## Python + +FLAML requires **Python version >= 3.7**. 
It can be installed from pip: + +```bash +pip install flaml +``` + +or conda: +``` +conda install flaml -c conda-forge +``` + +### Optional Dependencies + +#### Notebook + +To run the [notebook examples](https://github.com/microsoft/FLAML/tree/main/notebook), +install flaml with the [notebook] option: + +```bash +pip install flaml[notebook] +``` + +#### Extra learners + +* catboost +```bash +pip install flaml[catboost] +``` +* vowpal wabbit +```bash +pip install flaml[vw] +``` +* time series forecaster: prophet, statsmodels +```bash +pip install flaml[forecast] +``` + +* natural language processing: transformers +```bash +pip install flaml[nlp] +``` + +#### Distributed tuning + +* ray +```bash +pip install flaml[ray] +``` +* nni +```bash +pip install flaml[nni] +``` +* blendsearch +```bash +pip install flaml[blendsearch] +``` + +#### Test and Benchmark + +* test +```bash +pip install flaml[test] +``` +* benchmark +```bash +pip install flaml[benchmark] +``` + +## .NET + +FLAML has a .NET implementation in [ML.NET](http://dot.net/ml), an open-source, cross-platform machine learning framework for .NET. + +You can use FLAML in .NET in the following ways: + +**Low-code** + +- [*Model Builder*](https://dotnet.microsoft.com/apps/machinelearning-ai/ml-dotnet/model-builder) - A Visual Studio extension for training ML models using FLAML. For more information on how to install the, see the [install Model Builder](https://docs.microsoft.com/dotnet/machine-learning/how-to-guides/install-model-builder?tabs=visual-studio-2022) guide. +- [*ML.NET CLI*](https://docs.microsoft.com/dotnet/machine-learning/automate-training-with-cli) - A dotnet CLI tool for training machine learning models using FLAML on Windows, MacOS, and Linux. For more information on how to install the ML.NET CLI, see the [install the ML.NET CLI](https://docs.microsoft.com/dotnet/machine-learning/how-to-guides/install-ml-net-cli?tabs=windows) guide. 
+ +**Code-first** + +- [*Microsoft.ML.AutoML*](https://www.nuget.org/packages/Microsoft.ML.AutoML/0.20.0-preview.22313.1) - NuGet package that provides direct access to the FLAML AutoML APIs that power low-code solutions like Model Builder and the ML.NET CLI. For more information on installing NuGet packages, see the install and use a NuGet package in [Visual Studio](https://docs.microsoft.com/nuget/quickstart/install-and-use-a-package-in-visual-studio) or [dotnet CLI](https://docs.microsoft.com/nuget/quickstart/install-and-use-a-package-using-the-dotnet-cli) guides. + +To get started with the ML.NET API and AutoML, see the [csharp-notebooks](https://github.com/dotnet/csharp-notebooks#machine-learning). \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.4/Research.md b/website/versioned_docs/version-1.0.4/Research.md new file mode 100644 index 0000000000..6ed880d6f3 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Research.md @@ -0,0 +1,21 @@ +# Research + +For technical details, please check our research publications. + +* [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021. + +```bibtex +@inproceedings{wang2021flaml, + title={FLAML: A Fast and Lightweight AutoML Library}, + author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu}, + year={2021}, + booktitle={MLSys}, +} +``` + +* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021. +* [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021. 
+* [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. ACL 2021. +* [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. ICML 2021. +* [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2111.06495 (2021). +* [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. ArXiv preprint arXiv:2202.09927 (2022). diff --git a/website/versioned_docs/version-1.0.4/Use-Cases/Task-Oriented-AutoML.md b/website/versioned_docs/version-1.0.4/Use-Cases/Task-Oriented-AutoML.md new file mode 100644 index 0000000000..94025e57e8 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Use-Cases/Task-Oriented-AutoML.md @@ -0,0 +1,596 @@ +# Task Oriented AutoML + +## Overview + +[`flaml.AutoML`](../reference/automl/automl#automl-objects) is a class for task-oriented AutoML. It can be used as a scikit-learn style estimator with the standard `fit` and `predict` functions. The minimal inputs from users are the training data and the task type. + +* Training data: + - numpy array. When the input data are stored in numpy array, they are passed to `fit()` as `X_train` and `y_train`. + - pandas dataframe. When the input data are stored in pandas dataframe, they are passed to `fit()` either as `X_train` and `y_train`, or as `dataframe` and `label`. +* Tasks (specified via `task`): + - 'classification': classification with tabular data. + - 'regression': regression with tabular data. + - 'ts_forecast': time series forecasting. + - 'ts_forecast_classification': time series forecasting for classification. + - 'ts_forecast_panel': time series forecasting for panel datasets (multiple time series). + - 'rank': learning to rank. 
+ - 'seq-classification': sequence classification. + - 'seq-regression': sequence regression. + - 'summarization': text summarization. + - 'token-classification': token classification. + - 'multichoice-classification': multichoice classification. + +Two optional inputs are `time_budget` and `max_iter` for searching models and hyperparameters. When both are unspecified, only one model per estimator will be trained (using our [zero-shot](Zero-Shot-AutoML) technique). When `time_budget` is provided, there can be randomness in the result due to runtime variance. + +A typical way to use `flaml.AutoML`: + +```python +# Prepare training data +# ... +from flaml import AutoML +automl = AutoML() +automl.fit(X_train, y_train, task="regression", time_budget=60, **other_settings) +# Save the model +with open("automl.pkl", "wb") as f: + pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL) + +# At prediction time +with open("automl.pkl", "rb") as f: + automl = pickle.load(f) +pred = automl.predict(X_test) +``` + +If users provide the minimal inputs only, `AutoML` uses the default settings for optimization metric, estimator list etc. + +## Customize AutoML.fit() + +### Optimization metric + +The optimization metric is specified via the `metric` argument. It can be either a string which refers to a built-in metric, or a user-defined function. + +* Built-in metric. + - 'accuracy': 1 - accuracy as the corresponding metric to minimize. + - 'log_loss': default metric for multiclass classification. + - 'r2': 1 - r2_score as the corresponding metric to minimize. Default metric for regression. + - 'rmse': root mean squared error. + - 'mse': mean squared error. + - 'mae': mean absolute error. + - 'mape': mean absolute percentage error. + - 'roc_auc': minimize 1 - roc_auc_score. Default metric for binary classification. + - 'roc_auc_ovr': minimize 1 - roc_auc_score with `multi_class="ovr"`. + - 'roc_auc_ovo': minimize 1 - roc_auc_score with `multi_class="ovo"`. 
+ - 'roc_auc_weighted': minimize 1 - roc_auc_score with `average="weighted"`. + - 'roc_auc_ovr_weighted': minimize 1 - roc_auc_score with `multi_class="ovr"` and `average="weighted"`. + - 'roc_auc_ovo_weighted': minimize 1 - roc_auc_score with `multi_class="ovo"` and `average="weighted"`. + - 'f1': minimize 1 - f1_score. + - 'micro_f1': minimize 1 - f1_score with `average="micro"`. + - 'macro_f1': minimize 1 - f1_score with `average="macro"`. + - 'ap': minimize 1 - average_precision_score. + - 'ndcg': minimize 1 - ndcg_score. + - 'ndcg@k': minimize 1 - ndcg_score@k. k is an integer. +* User-defined function. +A customized metric function that requires the following (input) signature, and returns the input config’s value in terms of the metric you want to minimize, and a dictionary of auxiliary information at your choice: + +```python +def custom_metric( + X_val, y_val, estimator, labels, + X_train, y_train, weight_val=None, weight_train=None, + config=None, groups_val=None, groups_train=None, +): + return metric_to_minimize, metrics_to_log +``` + +For example, +```python +def custom_metric( + X_val, y_val, estimator, labels, + X_train, y_train, weight_val=None, weight_train=None, + **args, +): + from sklearn.metrics import log_loss + import time + + start = time.time() + y_pred = estimator.predict_proba(X_val) + pred_time = (time.time() - start) / len(X_val) + val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val) + y_pred = estimator.predict_proba(X_train) + train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train) + alpha = 0.5 + return val_loss * (1 + alpha) - alpha * train_loss, { + "val_loss": val_loss, + "train_loss": train_loss, + "pred_time": pred_time, + } +``` +It returns the validation loss penalized by the gap between validation and training loss as the metric to minimize, and three metrics to log: val_loss, train_loss and pred_time. 
The arguments `config`, `groups_val` and `groups_train` are not used in the function.

+### Estimator and search space
+
+The estimator list can contain one or more estimator names, each corresponding to a built-in estimator or a custom estimator. Each estimator has a search space for hyperparameter configurations. FLAML supports both classical machine learning models and deep neural networks.
+
+#### Estimator
+* Built-in estimator.
+  - 'lgbm': LGBMEstimator for task "classification", "regression", "rank", "ts_forecast" and "ts_forecast_classification". Hyperparameters: n_estimators, num_leaves, min_child_samples, learning_rate, log_max_bin (logarithm of (max_bin + 1) with base 2), colsample_bytree, reg_alpha, reg_lambda.
+  - 'xgboost': XGBoostSkLearnEstimator for task "classification", "regression", "rank", "ts_forecast" and "ts_forecast_classification". Hyperparameters: n_estimators, max_leaves, min_child_weight, learning_rate, subsample, colsample_bylevel, colsample_bytree, reg_alpha, reg_lambda.
+  - 'xgb_limitdepth': XGBoostLimitDepthEstimator for task "classification", "regression", "rank", "ts_forecast" and "ts_forecast_classification". Hyperparameters: n_estimators, max_depth, min_child_weight, learning_rate, subsample, colsample_bylevel, colsample_bytree, reg_alpha, reg_lambda.
+  - 'rf': RandomForestEstimator for task "classification", "regression", "ts_forecast" and "ts_forecast_classification". Hyperparameters: n_estimators, max_features, max_leaves, criterion (for classification only). Starting from v1.1.0,
+  it uses a fixed random_state by default.
+  - 'extra_tree': ExtraTreesEstimator for task "classification", "regression", "ts_forecast" and "ts_forecast_classification". Hyperparameters: n_estimators, max_features, max_leaves, criterion (for classification only). Starting from v1.1.0,
+  it uses a fixed random_state by default.
+  - 'lrl1': LRL1Classifier (sklearn.LogisticRegression with L1 regularization) for task "classification". 
Hyperparameters: C.
+  - 'lrl2': LRL2Classifier (sklearn.LogisticRegression with L2 regularization) for task "classification". Hyperparameters: C.
+  - 'catboost': CatBoostEstimator for task "classification" and "regression". Hyperparameters: early_stopping_rounds, learning_rate, n_estimators.
+  - 'kneighbor': KNeighborsEstimator for task "classification" and "regression". Hyperparameters: n_neighbors.
+  - 'prophet': Prophet for task "ts_forecast". Hyperparameters: changepoint_prior_scale, seasonality_prior_scale, holidays_prior_scale, seasonality_mode.
+  - 'arima': ARIMA for task "ts_forecast". Hyperparameters: p, d, q.
+  - 'sarimax': SARIMAX for task "ts_forecast". Hyperparameters: p, d, q, P, D, Q, s.
+  - 'transformer': Huggingface transformer models for task "seq-classification", "seq-regression", "multichoice-classification", "token-classification" and "summarization". Hyperparameters: learning_rate, num_train_epochs, per_device_train_batch_size, warmup_ratio, weight_decay, adam_epsilon, seed.
+  - 'temporal_fusion_transformer': TemporalFusionTransformerEstimator for task "ts_forecast_panel". Hyperparameters: gradient_clip_val, hidden_size, hidden_continuous_size, attention_head_size, dropout, learning_rate. There is a [known issue](https://github.com/jdb78/pytorch-forecasting/issues/1145) with pytorch-forecast logging.
+* Custom estimator. Use custom estimator for:
+  - tuning an estimator that is not built-in;
+  - customizing search space for a built-in estimator.
+
+#### Guidelines on tuning a custom estimator
+
+To tune a custom estimator that is not built-in, you need to:
+1. Build a custom estimator by inheriting [`flaml.model.BaseEstimator`](../reference/automl/model#baseestimator-objects) or a derived class.
+For example, if you have an estimator class with scikit-learn style `fit()` and `predict()` functions, you only need to set `self.estimator_class` to be that class in your constructor. 
+ +```python +from flaml.automl.model import SKLearnEstimator +# SKLearnEstimator is derived from BaseEstimator +import rgf + + +class MyRegularizedGreedyForest(SKLearnEstimator): + def __init__(self, task="binary", **config): + super().__init__(task, **config) + + if task in CLASSIFICATION: + from rgf.sklearn import RGFClassifier + + self.estimator_class = RGFClassifier + else: + from rgf.sklearn import RGFRegressor + + self.estimator_class = RGFRegressor + + @classmethod + def search_space(cls, data_size, task): + space = { + "max_leaf": { + "domain": tune.lograndint(lower=4, upper=data_size), + "low_cost_init_value": 4, + }, + "n_iter": { + "domain": tune.lograndint(lower=1, upper=data_size), + "low_cost_init_value": 1, + }, + "learning_rate": {"domain": tune.loguniform(lower=0.01, upper=20.0)}, + "min_samples_leaf": { + "domain": tune.lograndint(lower=1, upper=20), + "init_value": 20, + }, + } + return space +``` + +In the constructor, we set `self.estimator_class` as `RGFClassifier` or `RGFRegressor` according to the task type. If the estimator you want to tune does not have a scikit-learn style `fit()` and `predict()` API, you can override the `fit()` and `predict()` function of `flaml.model.BaseEstimator`, like [XGBoostEstimator](../reference/automl/model#xgboostestimator-objects). Importantly, we also add the `task="binary"` parameter in the signature of `__init__` so that it doesn't get grouped together with the `**config` kwargs that determines the parameters with which the underlying estimator (`self.estimator_class`) is constructed. If your estimator doesn't use one of the parameters that it is passed, for example some regressors in `scikit-learn` don't use the `n_jobs` parameter, it is enough to add `n_jobs=None` to the signature so that it is ignored by the `**config` dict. + +2. Give the custom estimator a name and add it in AutoML. 
E.g., + +```python +from flaml import AutoML +automl = AutoML() +automl.add_learner("rgf", MyRegularizedGreedyForest) +``` + +This registers the `MyRegularizedGreedyForest` class in AutoML, with the name "rgf". + +3. Tune the newly added custom estimator in either of the following two ways depending on your needs: +- tune rgf alone: `automl.fit(..., estimator_list=["rgf"])`; or +- mix it with other built-in learners: `automl.fit(..., estimator_list=["rgf", "lgbm", "xgboost", "rf"])`. + +#### Search space + +Each estimator class, built-in or not, must have a `search_space` function. In the `search_space` function, we return a dictionary about the hyperparameters, the keys of which are the names of the hyperparameters to tune, and each value is a set of detailed search configurations about the corresponding hyperparameters represented in a dictionary. A search configuration dictionary includes the following fields: +* `domain`, which specifies the possible values of the hyperparameter and their distribution. Please refer to [more details about the search space domain](Tune-User-Defined-Function#more-details-about-the-search-space-domain). +* `init_value` (optional), which specifies the initial value of the hyperparameter. +* `low_cost_init_value`(optional), which specifies the value of the hyperparameter that is associated with low computation cost. See [cost related hyperparameters](Tune-User-Defined-Function#cost-related-hyperparameters) or [FAQ](../FAQ#about-low_cost_partial_config-in-tune) for more details. + +In the example above, we tune four hyperparameters, three integers and one float. They all follow a log-uniform distribution. "max_leaf" and "n_iter" have "low_cost_init_value" specified as their values heavily influence the training cost. + +To customize the search space for a built-in estimator, use a similar approach to define a class that inherits the existing estimator. 
For example, + +```python +from flaml.automl.model import XGBoostEstimator + + +def logregobj(preds, dtrain): + labels = dtrain.get_label() + preds = 1.0 / (1.0 + np.exp(-preds)) # transform raw leaf weight + grad = preds - labels + hess = preds * (1.0 - preds) + return grad, hess + + +class MyXGB1(XGBoostEstimator): + """XGBoostEstimator with logregobj as the objective function""" + + def __init__(self, **config): + super().__init__(objective=logregobj, **config) +``` + +We override the constructor and set the training objective as a custom function `logregobj`. The hyperparameters and their search range do not change. For another example, + +```python +class XGBoost2D(XGBoostSklearnEstimator): + @classmethod + def search_space(cls, data_size, task): + upper = min(32768, int(data_size)) + return { + "n_estimators": { + "domain": tune.lograndint(lower=4, upper=upper), + "low_cost_init_value": 4, + }, + "max_leaves": { + "domain": tune.lograndint(lower=4, upper=upper), + "low_cost_init_value": 4, + }, + } +``` + +We override the `search_space` function to tune two hyperparameters only, "n_estimators" and "max_leaves". They are both random integers in the log space, ranging from 4 to data-dependent upper bound. The lower bound for each corresponds to low training cost, hence the "low_cost_init_value" for each is set to 4. + +##### A shortcut to override the search space + +One can use the `custom_hp` argument in [`AutoML.fit()`](../reference/automl/automl#fit) to override the search space for an existing estimator quickly. 
For example, if you would like to temporarily change the search range of "n_estimators" of xgboost, disable searching "max_leaves" in random forest, and add "subsample" in the search space of lightgbm, you can set: + +```python +custom_hp = { + "xgboost": { + "n_estimators": { + "domain": tune.lograndint(lower=new_lower, upper=new_upper), + "low_cost_init_value": new_lower, + }, + }, + "rf": { + "max_leaves": { + "domain": None, # disable search + }, + }, + "lgbm": { + "subsample": { + "domain": tune.uniform(lower=0.1, upper=1.0), + "init_value": 1.0, + }, + "subsample_freq": { + "domain": 1, # subsample_freq must > 0 to enable subsample + }, + }, +} +``` + +### Constraint + +There are several types of constraints you can impose. + +1. Constraints on the AutoML process. + +- `time_budget`: constrains the wall-clock time (seconds) used by the AutoML process. We provide some tips on [how to set time budget](#how-to-set-time-budget). + +- `max_iter`: constrains the maximal number of models to try in the AutoML process. + +2. Constraints on the constructor arguments of the estimators. + +Some constraints on the estimator can be implemented via the custom learner. For example, + +```python +class MonotonicXGBoostEstimator(XGBoostSklearnEstimator): + @classmethod + def search_space(**args): + space = super().search_space(**args) + space.update({"monotone_constraints": {"domain": "(1, -1)"}}) + return space +``` + +It adds a monotonicity constraint to XGBoost. This approach can be used to set any constraint that is an argument in the underlying estimator's constructor. +A shortcut to do this is to use the [`custom_hp`](#a-shortcut-to-override-the-search-space) argument: + +```python +custom_hp = { + "xgboost": { + "monotone_constraints": { + "domain": "(1, -1)" # fix the domain as a constant + } + } +} +``` + +3. Constraints on the models tried in AutoML. 
+ +Users can set constraints such as the maximal number of models to try, limit on training time and prediction time per model. +* `train_time_limit`: training time in seconds. +* `pred_time_limit`: prediction time per instance in seconds. + +For example, +```python +automl.fit(X_train, y_train, max_iter=100, train_time_limit=1, pred_time_limit=1e-3) +``` + +4. Constraints on the metrics of the ML model tried in AutoML. + +When users provide a [custom metric function](#optimization-metric), which returns a primary optimization metric and a dictionary of additional metrics (typically also about the model) to log, users can also specify constraints on one or more of the metrics in the dictionary of additional metrics. + +Users need to provide a list of such constraints in the following format: +Each element in this list is a 3-tuple, which shall be expressed +in the following format: the first element of the 3-tuple is the name of the +metric, the second element is the inequality sign chosen from ">=" and "<=", +and the third element is the constraint value. E.g., `('val_loss', '<=', 0.1)`. + +For example, +```python +metric_constraints = [("train_loss", "<=", 0.1), ("val_loss", "<=", 0.1)] +automl.fit(X_train, y_train, max_iter=100, train_time_limit=1, metric_constraints=metric_constraints) +``` + +### Ensemble + +To use stacked ensemble after the model search, set `ensemble=True` or a dict. When `ensemble=True`, the final estimator and `passthrough` in the stacker will be automatically chosen. You can specify customized final estimator or passthrough option: +* "final_estimator": an instance of the final estimator in the stacker. +* "passthrough": True (default) or False, whether to pass the original features to the stacker. 
+
+For example,
+```python
+automl.fit(
+    X_train, y_train, task="classification",
+    ensemble={
+        "final_estimator": LogisticRegression(),
+        "passthrough": False,
+    },
+)
+```
+
+### Resampling strategy
+
+By default, flaml decides the resampling automatically according to the data size and the time budget. If you would like to enforce a certain resampling strategy, you can set `eval_method` to be "holdout" or "cv" for holdout or cross-validation.
+
+For holdout, you can also set:
+* `split_ratio`: the fraction for validation data, 0.1 by default.
+* `X_val`, `y_val`: a separate validation dataset. When they are passed, the validation metrics will be computed against this given validation dataset. If they are not passed, then a validation dataset will be split from the training data and held out from training during the model search. After the model search, flaml will retrain the model with best configuration on the full training data.
+You can set `retrain_full` to be `False` to skip the final retraining or "budget" to ask flaml to do its best to retrain within the time budget.
+
+For cross validation, you can also set `n_splits` of the number of folds. By default it is 5.
+
+#### Data split method
+
+By default, flaml uses the following method to split the data:
+* stratified split for classification;
+* uniform split for regression;
+* time-based split for time series forecasting;
+* group-based split for learning to rank.
+
+The data split method for classification can be changed into uniform split by setting `split_type="uniform"`. The data are shuffled when `split_type in ("uniform", "stratified")`.
+For both classification and regression, time-based split can be enforced if the data are sorted by timestamps, by setting `split_type="time"`.
+
+When `eval_method="cv"`, `split_type` can also be set as a custom splitter. 
It needs to be an instance of a derived class of scikit-learn +[KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold) +and have ``split`` and ``get_n_splits`` methods with the same signatures. To disable shuffling, the splitter instance must contain the attribute `shuffle=False`. + +### Parallel tuning + +When you have parallel resources, you can either spend them in training and keep the model search sequential, or perform parallel search. Following scikit-learn, the parameter `n_jobs` specifies how many CPU cores to use for each training job. The number of parallel trials is specified via the parameter `n_concurrent_trials`. By default, `n_jobs=-1, n_concurrent_trials=1`. That is, all the CPU cores (in a single compute node) are used for training a single model and the search is sequential. When you have more resources than what each single training job needs, you can consider increasing `n_concurrent_trials`. + +To do parallel tuning, install the `ray` and `blendsearch` options: +```bash +pip install flaml[ray,blendsearch] +``` + +`ray` is used to manage the resources. For example, +```python +ray.init(num_cpus=16) +``` +allocates 16 CPU cores. Then, when you run: +```python +automl.fit(X_train, y_train, n_jobs=4, n_concurrent_trials=4) +``` +flaml will perform 4 trials in parallel, each consuming 4 CPU cores. The parallel tuning uses the [BlendSearch](Tune-User-Defined-Function##blendsearch-economical-hyperparameter-optimization-with-blended-search-strategy) algorithm. + +#### **Guidelines on parallel vs sequential tuning** + +**(1) Considerations on wall-clock time.** + +One common motivation for parallel tuning is to save wall-clock time. When sequential tuning and parallel tuning achieve a similar wall-clock time, sequential tuning should be preferred. This is a rule of thumb when the HPO algorithm is sequential by nature (e.g., Bayesian Optimization and FLAML's HPO algorithms CFO and BS). 
Sequential tuning allows the HPO algorithms to take advantage of the historical trial results. Then the question is **How to estimate the wall-clock-time needed by parallel tuning and sequential tuning**? + +You can use the following way to roughly estimate the wall-clock time in parallel tuning and sequential tuning: To finish $N$ trials of hyperparameter tuning, i.e., run $N$ hyperparameter configurations, the total wall-clock time needed is $N/k*(SingleTrialTime + Overhead)$, in which $SingleTrialTime$ is the trial time to evaluate a particular hyperparameter configuration, $k$ is the scale of parallelism, e.g., the number of parallel CPU/GPU cores, and $Overhead$ is the computation overhead. + +In sequential tuning, $k=1$, and in parallel tuning $k>1$. This may suggest that parallel tuning has a shorter wall-clock time. But it is not always the case considering the other two factors $SingleTrialTime$, and $Overhead$: + +- The $Overhead$ in sequential tuning is typically negligible; while in parallel tuning, it is relatively large. + +- You can also try to reduce the $SingleTrialTime$ to reduce the wall-clock time in sequential tuning: For example, by increasing the resource consumed by a single trial (distributed or multi-thread training), you can reduce $SingleTrialTime$. One concrete example is to use the `n_jobs` parameter that sets the number of threads the fitting process can use in many scikit-learn style algorithms. + +**(2) Considerations on randomness.** + +Potential reasons that cause randomness: +1. Parallel tuning: In the case of parallel tuning, the order of trials' finishing time is no longer deterministic. This non-deterministic order, combined with sequential HPO algorithms, leads to a non-deterministic hyperparameter tuning trajectory. + +2. 
Distributed or multi-thread training: Distributed/multi-thread training may introduce randomness in model training, i.e., the trained model with the same hyperparameter may be different because of such randomness. This model-level randomness may be undesirable in some cases.
+
+### Warm start
+
+We can warm start the AutoML by providing starting points of hyperparameter configurations for each estimator. For example, if you have run AutoML for one hour, after checking the results, you would like to run it for another two hours, then you can use the best configurations found for each estimator as the starting points for the new run.
+
+```python
+automl1 = AutoML()
+automl1.fit(X_train, y_train, time_budget=3600)
+automl2 = AutoML()
+automl2.fit(X_train, y_train, time_budget=7200, starting_points=automl1.best_config_per_estimator)
+```
+
+`starting_points` is a dictionary or a str to specify the starting hyperparameter config. (1) When it is a dictionary, the keys are the estimator names. If you do not need to specify starting points for an estimator, exclude its name from the dictionary. The value for each key can be either a dictionary or a list of dictionaries, corresponding to one hyperparameter configuration, or multiple hyperparameter configurations, respectively. (2) When it is a str: if "data", use data-dependent defaults; if "data:path", use data-dependent defaults which are stored at path; if "static", use data-independent defaults. Please find more details about data-dependent defaults in [zero shot AutoML](Zero-Shot-AutoML#combine-zero-shot-automl-and-hyperparameter-tuning).
+
+### Log the trials
+
+The trials are logged in a file if a `log_file_name` is passed.
+Each trial is logged as a json record in one line. The best trial's id is logged in the last line. 
For example, +``` +{"record_id": 0, "iter_per_learner": 1, "logged_metric": null, "trial_time": 0.12717914581298828, "wall_clock_time": 0.1728971004486084, "validation_loss": 0.07333333333333332, "config": {"n_estimators": 4, "num_leaves": 4, "min_child_samples": 20, "learning_rate": 0.09999999999999995, "log_max_bin": 8, "colsample_bytree": 1.0, "reg_alpha": 0.0009765625, "reg_lambda": 1.0}, "learner": "lgbm", "sample_size": 150} +{"record_id": 1, "iter_per_learner": 3, "logged_metric": null, "trial_time": 0.07027268409729004, "wall_clock_time": 0.3756711483001709, "validation_loss": 0.05333333333333332, "config": {"n_estimators": 4, "num_leaves": 4, "min_child_samples": 12, "learning_rate": 0.2677050123105203, "log_max_bin": 7, "colsample_bytree": 1.0, "reg_alpha": 0.001348364934537134, "reg_lambda": 1.4442580148221913}, "learner": "lgbm", "sample_size": 150} +{"curr_best_record_id": 1} +``` + +1. `iter_per_learner` means how many models have been tried for each learner. The reason you see records like `iter_per_learner=3` for `record_id=1` is that flaml only logs better configs than the previous iters by default, i.e., `log_type='better'`. If you use `log_type='all'` instead, all the trials will be logged. +1. `trial_time` means the time taken to train and evaluate one config in that trial. `total_search_time` is the total time spent from the beginning of `fit()`. +1. flaml will adjust the `n_estimators` for lightgbm etc. according to the remaining budget and check the time budget constraint and stop in several places. Most of the time that makes `fit()` stops before the given budget. Occasionally it may run over the time budget slightly. But the log file always contains the best config info and you can recover the best model until any time point using `retrain_from_log()`. 
+ +We can also use mlflow for logging: +```python +mlflow.set_experiment("flaml") +with mlflow.start_run(): + automl.fit(X_train=X_train, y_train=y_train, **settings) +``` + +### Extra fit arguments + +Extra fit arguments that are needed by the estimators can be passed to `AutoML.fit()`. For example, if there is a weight associated with each training example, they can be passed via `sample_weight`. For another example, `period` can be passed for time series forecaster. For any extra keywork argument passed to `AutoML.fit()` which has not been explicitly listed in the function signature, it will be passed to the underlying estimators' `fit()` as is. For another example, you can set the number of gpus used by each trial with the `gpu_per_trial` argument, which is only used by TransformersEstimator and XGBoostSklearnEstimator. + +In addition, you can specify the different arguments needed by different estimators using the `fit_kwargs_by_estimator` argument. For example, you can set the custom arguments for a Transformers model: + +```python +from flaml.automl.data import load_openml_dataset +from flaml import AutoML + +X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir="./") + +automl = AutoML() +automl_settings = { + "task": "classification", + "time_budget": 10, + "estimator_list": ["catboost", "rf"], + "fit_kwargs_by_estimator": { + "catboost": { + "verbose": True, # setting the verbosity of catboost to True + } + }, +} +automl.fit(X_train=X_train, y_train=y_train, **automl_settings) +``` + +## Retrieve and analyze the outcomes of AutoML.fit() + +### Get best model + +The best model can be obtained by the `model` property of an `AutoML` instance. For example, + +```python +automl.fit(X_train, y_train, task="regression") +print(automl.model) +# +``` + +[`flaml.model.LGBMEstimator`](../reference/automl/model#lgbmestimator-objects) is a wrapper class for LightGBM models. 
To access the underlying model, use the `estimator` property of the `flaml.model.LGBMEstimator` instance. + +```python +print(automl.model.estimator) +''' +LGBMRegressor(colsample_bytree=0.7610534336273627, + learning_rate=0.41929025492645006, max_bin=255, + min_child_samples=4, n_estimators=45, num_leaves=4, + reg_alpha=0.0009765625, reg_lambda=0.009280655005879943, + verbose=-1) +''' +``` + +Just like a normal LightGBM model, we can inspect it. For example, we can plot the feature importance: +```python +import matplotlib.pyplot as plt +plt.barh(automl.model.estimator.feature_name_, automl.model.estimator.feature_importances_) +``` +![png](images/feature_importance.png) + +### Get best configuration + +We can find the best estimator's name and best configuration by: + +```python +print(automl.best_estimator) +# lgbm +print(automl.best_config) +# {'n_estimators': 148, 'num_leaves': 18, 'min_child_samples': 3, 'learning_rate': 0.17402065726724145, 'log_max_bin': 8, 'colsample_bytree': 0.6649148062238498, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.0067613624509965} +``` + +We can also find the best configuration per estimator. + +```python +print(automl.best_config_per_estimator) +# {'lgbm': {'n_estimators': 148, 'num_leaves': 18, 'min_child_samples': 3, 'learning_rate': 0.17402065726724145, 'log_max_bin': 8, 'colsample_bytree': 0.6649148062238498, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.0067613624509965}, 'rf': None, 'catboost': None, 'xgboost': {'n_estimators': 4, 'max_leaves': 4, 'min_child_weight': 1.8630223791106992, 'learning_rate': 1.0, 'subsample': 0.8513627344387318, 'colsample_bylevel': 1.0, 'colsample_bytree': 0.946138073111236, 'reg_alpha': 0.0018311776973217073, 'reg_lambda': 0.27901659190538414}, 'extra_tree': {'n_estimators': 4, 'max_features': 1.0, 'max_leaves': 4}} +``` + +The `None` value corresponds to the estimators which have not been tried. 
+
+Other useful information:
+```python
+print(automl.best_config_train_time)
+# 0.24841618537902832
+print(automl.best_iteration)
+# 10
+print(automl.best_loss)
+# 0.15448622217577546
+print(automl.time_to_find_best_model)
+# 0.4167296886444092
+print(automl.config_history)
+# {0: ('lgbm', {'n_estimators': 4, 'num_leaves': 4, 'min_child_samples': 20, 'learning_rate': 0.09999999999999995, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 0.0009765625, 'reg_lambda': 1.0}, 1.2300517559051514)}
+# Meaning: at iteration 0, the config tried is {'n_estimators': 4, 'num_leaves': 4, 'min_child_samples': 20, 'learning_rate': 0.09999999999999995, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 0.0009765625, 'reg_lambda': 1.0} for lgbm, and the wallclock time is 1.23s when this trial is finished.
+```
+
+### Plot learning curve
+
+To plot how the loss is improved over time during the model search, first load the search history from the log file:
+
+```python
+from flaml.automl.data import get_output_from_log
+
+time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \
+    get_output_from_log(filename=settings["log_file_name"], time_budget=120)
+```
+
+Then, assuming the optimization metric is "accuracy", we can plot the accuracy versus wallclock time:
+
+```python
+import matplotlib.pyplot as plt
+import numpy as np
+
+plt.title("Learning Curve")
+plt.xlabel("Wall Clock Time (s)")
+plt.ylabel("Validation Accuracy")
+plt.step(time_history, 1 - np.array(best_valid_loss_history), where="post")
+plt.show()
+```
+
+![png](images/curve.png)
+
+The curve suggests that increasing the time budget may further improve the accuracy.
+
+### How to set time budget
+
+* If you have an exact constraint for the total search time, set it as the time budget.
+* If you have flexible time constraints, for example, your desirable time budget is t1=60s, and the longest time budget you can tolerate is t2=3600s, you can try the following two ways:
+1. 
set t1 as the time budget, and check the message in the console log in the end. If the budget is too small, you will see a warning like +> WARNING - Time taken to find the best model is 91% of the provided time budget and not all estimators' hyperparameter search converged. Consider increasing the time budget. +2. set t2 as the time budget, and also set `early_stop=True`. If the early stopping is triggered, you will see a warning like +> WARNING - All estimator hyperparameters local search has converged at least once, and the total search time exceeds 10 times the time taken to find the best model. + + > WARNING - Stopping search as early_stop is set to True. + +### How much time is needed to find the best model + +If you want to get a sense of how much time is needed to find the best model, you can use `max_iter=2` to perform two trials first. The message will be like: +> INFO - iteration 0, current learner lgbm + +> INFO - Estimated sufficient time budget=145194s. Estimated necessary time budget=2118s. + +> INFO - at 2.6s, estimator lgbm's best error=0.4459, best estimator lgbm's best error=0.4459 + +You will see that the time to finish the first and cheapest trial is 2.6 seconds. The estimated necessary time budget is 2118 seconds, and the estimated sufficient time budget is 145194 seconds. Note that this is only an estimated range to help you decide your budget. diff --git a/website/versioned_docs/version-1.0.4/Use-Cases/Tune-User-Defined-Function.md b/website/versioned_docs/version-1.0.4/Use-Cases/Tune-User-Defined-Function.md new file mode 100644 index 0000000000..39f04eb863 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Use-Cases/Tune-User-Defined-Function.md @@ -0,0 +1,640 @@ +# Tune User Defined Function + +[`flaml.tune`](../reference/tune/tune) is a module for economical hyperparameter tuning. It is used internally by `flaml.AutoML`. 
It can also be used to directly tune a user-defined function (UDF), which is not limited to machine learning model training. You can use `flaml.tune` instead of `flaml.AutoML` if one of the following is true: + +1. Your machine learning task is not one of the built-in tasks from `flaml.AutoML`. +1. Your input cannot be represented as X_train + y_train or dataframe + label. +1. The optimization metric is not measurable via validation data only. For example, when you want to directly optimize a downstream application instead of a model accuracy metric. +1. You need to tune a function that may not even be a machine learning procedure. + +## Basic Tuning Procedure + +There are three essential steps (assuming the knowledge of the set of hyperparameters to tune) to use `flaml.tune` to finish a basic tuning task: +1. Specify the [tuning objective](#tuning-objective) with respect to the hyperparameters. +1. Specify a [search space](#search-space) of the hyperparameters. +1. Specify [tuning constraints](#tuning-constraints), including constraints on the resource budget to do the tuning, constraints on the configurations, or/and constraints on a (or multiple) particular metric(s). + +With these steps, you can [perform a basic tuning task](#put-together) accordingly. + +### Tuning objective + +Related arguments: +- `evaluation_function`: A user-defined evaluation function. +- `metric`: A string of the metric name to optimize for. +- `mode`: A string in ['min', 'max'] to specify the objective as minimization or maximization. + +The first step is to specify your tuning objective. +To do it, you should first specify your evaluation procedure (e.g., perform a machine learning model training and validation) with respect to the hyperparameters in a user-defined function `evaluation_function`. +The function requires a hyperparameter configuration as input, and can simply return a metric value in a scalar or return a dictionary of metric name and metric value pairs. 
+ +In the following code, we define an evaluation function with respect to two hyperparameters named `x` and `y` according to $obj := (x-85000)^2 - x/y$. Note that we use this toy example here for more accessible demonstration purposes. In real use cases, the evaluation function usually cannot be written in this closed form, but instead involves a black-box and expensive evaluation procedure. Please check out [Tune HuggingFace](../Examples/Tune-HuggingFace), [Tune PyTorch](../Examples/Tune-PyTorch) and [Tune LightGBM](../Getting-Started#tune-user-defined-function) for real examples of tuning tasks. + +```python +import time + +def evaluate_config(config: dict): + """evaluate a hyperparameter configuration""" + score = (config["x"] - 85000) ** 2 - config["x"] / config["y"] + # usually the evaluation takes an non-neglible cost + # and the cost could be related to certain hyperparameters + # here we simulate this cost by calling the time.sleep() function + # here we assume the cost is proportional to x + faked_evaluation_cost = config["x"] / 100000 + time.sleep(faked_evaluation_cost) + # we can return a single float as a score on the input config: + # return score + # or, we can return a dictionary that maps metric name to metric value: + return {"score": score, "evaluation_cost": faked_evaluation_cost, "constraint_metric": config["x"] * config["y"]} +``` + +When the evaluation function returns a dictionary of metrics, you need to specify the name of the metric to optimize via the argument `metric` (this can be skipped when the function is just returning a scalar). In addition, you need to specify a mode of your optimization/tuning task (maximization or minimization) via the argument `mode` by choosing from "min" or "max". + +For example, + +```python +flaml.tune.run(evaluation_function=evaluate_config, metric="score", mode="min", ...) +``` + +### Search space + +Related arguments: +- `config`: A dictionary to specify the search space. 
+
+- `low_cost_partial_config` (optional): A dictionary from a subset of controlled dimensions to the initial low-cost values.
+- `cat_hp_cost` (optional): A dictionary from a subset of categorical dimensions to the relative cost of each choice.
+
+The second step is to specify a search space of the hyperparameters through the argument `config`. In the search space, you need to specify valid values for your hyperparameters and can specify how these values are sampled (e.g., from a uniform distribution or a log-uniform distribution).
+
+In the following code example, we include a search space for the two hyperparameters `x` and `y` as introduced above. The valid values for both are integers in the range of [1, 100000]. The values for `x` are sampled uniformly in logarithmic space of the specified range (using `tune.lograndint(lower=1, upper=100000)`), and the values for `y` are sampled uniformly in the specified range (using `tune.randint(lower=1, upper=100000)`).
+
+
+```python
+from flaml import tune
+
+# construct a search space for the hyperparameters x and y.
+config_search_space = {
+    "x": tune.lograndint(lower=1, upper=100000),
+    "y": tune.randint(lower=1, upper=100000)
+}
+
+# provide the search space to tune.run
+tune.run(..., config=config_search_space, ...)
+```
+
+#### **Details and guidelines on hyperparameter search space**
+The corresponding value of a particular hyperparameter in the search space dictionary is called a *domain*, for example, `tune.randint(lower=1, upper=100000)` is the domain for the hyperparameter `y`.
+The domain specifies a *type* and *valid range* to sample parameters from. Supported types include float, integer, and categorical.
+
+- **Categorical hyperparameter**
+
+  If it is a categorical hyperparameter, then you should use `tune.choice(possible_choices)` in which `possible_choices` is the list of possible categorical values of the hyperparameter. 
For example, if you are tuning the optimizer used in model training, and the candidate optimizers are "sgd" and "adam", you should specify the search space in the following way: +```python +{ + "optimizer": tune.choice(["sgd", "adam"]), +} +``` +- **Numerical hyperparameter** + +If it is a numerical hyperparameter, you need to know whether it takes integer values or float values. In addition, you need to know: +- The range of valid values, i.e., what are the lower limit and upper limit of the hyperparameter value? +- Do you want to sample in linear scale or log scale? It is a common practice to sample in the log scale if the valid value range is large and the evaluation function changes more regularly with respect to the log domain, as shown in the following example for learning rate tuning. In this code example, we set the lower limit and the upper limit of the learning rate to be 1/1024 and 1.0, respectively. We sample in the log space because model performance changes more regularly in the log scale with respect to the learning rate within such a large search range. + +```python +{ +"learning_rate": tune.loguniform(lower=1 / 1024, upper=1.0), +} +``` +When the search range of learning rate is small, it is more common to sample in the linear scale as shown in the following example, + +```python +{ +"learning_rate": tune.uniform(lower=0.1, upper=0.2), +} +``` + + +- Do you have quantization granularity requirements? + +When you have a desired quantization granularity for the hyperparameter change, you can use `tune.qlograndint` or `tune.qloguniform` to realize the quantization requirement. 
The following code example helps you realize the need for sampling uniformly in the range of 0.1 and 0.2 with increments of 0.02, i.e., the sampled learning rate can only take values in {0.1, 0.12, 0.14, 0.16, ..., 0.2},
+```python
+{
+"learning_rate": tune.quniform(lower=0.1, upper=0.2, q=0.02),
+}
+```
+
+You can find the corresponding search space choice in the table below once you have answers to the aforementioned three questions.
+
+
+| | Integer | Float |
+| ----------- | ----------- | ----------- |
+| linear scale | tune.randint(lower: int, upper: int)| tune.uniform(lower: float, upper: float)|
+| log scale | tune.lograndint(lower: int, upper: int, base: float = 10)| tune.loguniform(lower: float, upper: float, base: float = 10)|
+| linear scale with quantization| tune.qrandint(lower: int, upper: int, q: int = 1)| tune.quniform(lower: float, upper: float, q: float = 1)|
+| log scale with quantization | tune.qlograndint(lower: int, upper: int, q: int = 1, base: float = 10)| tune.qloguniform(lower: float, upper: float, q: float = 1, base: float = 10)|
+
+
+See the example below for the commonly used types of domains. 
+ +```python +config = { + # Sample a float uniformly between -5.0 and -1.0 + "uniform": tune.uniform(-5, -1), + + # Sample a float uniformly between 3.2 and 5.4, + # rounding to increments of 0.2 + "quniform": tune.quniform(3.2, 5.4, 0.2), + + # Sample a float uniformly between 0.0001 and 0.01, while + # sampling in log space + "loguniform": tune.loguniform(1e-4, 1e-2), + + # Sample a float uniformly between 0.0001 and 0.1, while + # sampling in log space and rounding to increments of 0.00005 + "qloguniform": tune.qloguniform(1e-4, 1e-1, 5e-5), + + # Sample a random float from a normal distribution with + # mean=10 and sd=2 + "randn": tune.randn(10, 2), + + # Sample a random float from a normal distribution with + # mean=10 and sd=2, rounding to increments of 0.2 + "qrandn": tune.qrandn(10, 2, 0.2), + + # Sample a integer uniformly between -9 (inclusive) and 15 (exclusive) + "randint": tune.randint(-9, 15), + + # Sample a random uniformly between -21 (inclusive) and 12 (inclusive (!)) + # rounding to increments of 3 (includes 12) + "qrandint": tune.qrandint(-21, 12, 3), + + # Sample a integer uniformly between 1 (inclusive) and 10 (exclusive), + # while sampling in log space + "lograndint": tune.lograndint(1, 10), + + # Sample a integer uniformly between 2 (inclusive) and 10 (inclusive (!)), + # while sampling in log space and rounding to increments of 2 + "qlograndint": tune.qlograndint(2, 10, 2), + + # Sample an option uniformly from the specified choices + "choice": tune.choice(["a", "b", "c"]), +} +``` + + + +#### Cost-related hyperparameters + +Cost-related hyperparameters are a subset of the hyperparameters which directly affect the computation cost incurred in the evaluation of any hyperparameter configuration. For example, the number of estimators (`n_estimators`) and the maximum number of leaves (`max_leaves`) are known to affect the training cost of tree-based learners. So they are cost-related hyperparameters for tree-based learners. 
+ +When cost-related hyperparameters exist, the evaluation cost in the search space is heterogeneous. +In this case, designing a search space with proper ranges of the hyperparameter values is highly non-trivial. Classical tuning algorithms such as Bayesian optimization and random search are typically sensitive to such ranges. It may take them a very high cost to find a good choice if the ranges are too large. And if the ranges are too small, the optimal choice(s) may not be included and thus not possible to be found. With our method, you can use a search space with larger ranges in the case of heterogeneous cost. + +Our search algorithms are designed to finish the tuning process at a low total cost when the evaluation cost in the search space is heterogeneous. +So in such scenarios, if you are aware of low-cost configurations for the cost-related hyperparameters, you are encouraged to set them as the `low_cost_partial_config`, which is a dictionary of a subset of the hyperparameter coordinates whose value corresponds to a configuration with known low cost. Using the example of the tree-based methods again, since we know that small `n_estimators` and `max_leaves` generally correspond to simpler models and thus lower cost, we set `{'n_estimators': 4, 'max_leaves': 4}` as the `low_cost_partial_config` by default (note that 4 is the lower bound of search space for these two hyperparameters), e.g., in LGBM. Please find more details on how the algorithm works [here](#cfo-frugal-optimization-for-cost-related-hyperparameters). + + +In addition, if you are aware of the cost relationship between different categorical hyperparameter choices, you are encouraged to provide this information through `cat_hp_cost`. It also helps the search algorithm to reduce the total cost. + +### Tuning constraints + +Related arguments: +- `time_budget_s`: The time budget in seconds. +- `num_samples`: An integer of the number of configs to try. 
+
+- `config_constraints` (optional): A list of config constraints to be satisfied.
+- `metric_constraints` (optional): A list of metric constraints to be satisfied, e.g., `[("precision", ">=", 0.9)]`.
+
+The third step is to specify constraints of the tuning task. One notable property of `flaml.tune` is that it is able to finish the tuning process (obtaining good results) within a required resource constraint. A user can either provide the resource constraint in terms of wall-clock time (in seconds) through the argument `time_budget_s`, or in terms of the number of trials through the argument `num_samples`. The following example shows three use cases:
+
+```python
+# Set a resource constraint of 60 seconds wall-clock time for the tuning.
+flaml.tune.run(..., time_budget_s=60, ...)
+
+# Set a resource constraint of 100 trials for the tuning.
+flaml.tune.run(..., num_samples=100, ...)
+
+# Use at most 60 seconds and at most 100 trials for the tuning.
+flaml.tune.run(..., time_budget_s=60, num_samples=100, ...)
+```
+
+
+Optionally, you can provide a list of config constraints to be satisfied through the argument `config_constraints` and provide a list of metric constraints to be satisfied through the argument `metric_constraints`. We provide more details about related use cases in the [Advanced Tuning Options](#more-constraints-on-the-tuning) section.
+
+
+### Put together
+After the aforementioned key steps, one is ready to perform a tuning task by calling [`flaml.tune.run()`](../reference/tune/tune#run). Below is a quick sequential tuning example using the pre-defined search space `config_search_space` and a minimization (`mode='min'`) objective for the `score` metric evaluated in `evaluate_config`, using the default search algorithm in flaml. The time budget is 10 seconds (`time_budget_s=10`). 
+```python
+# require: pip install flaml[blendsearch]
+analysis = tune.run(
+    evaluate_config,  # the function to evaluate a config
+    config=config_search_space,  # the search space defined
+    metric="score",
+    mode="min",  # the optimization mode, "min" or "max"
+    num_samples=-1,  # the maximal number of configs to try, -1 means infinite
+    time_budget_s=10,  # the time budget in seconds
+)
+```
+
+
+### Result analysis
+
+Once the tuning process finishes, it returns an [ExperimentAnalysis](../reference/tune/analysis) object, which provides methods to analyze the tuning.
+
+In the following code example, we retrieve the best configuration found during the tuning, and retrieve the best trial's result from the returned `analysis`.
+
+```python
+analysis = tune.run(
+    evaluate_config,  # the function to evaluate a config
+    config=config_search_space,  # the search space defined
+    metric="score",
+    mode="min",  # the optimization mode, "min" or "max"
+    num_samples=-1,  # the maximal number of configs to try, -1 means infinite
+    time_budget_s=10,  # the time budget in seconds
+)
+print(analysis.best_config)  # the best config
+print(analysis.best_trial.last_result)  # the best trial's result
+```
+
+## Advanced Tuning Options
+
+There are several advanced tuning options worth mentioning.
+
+### More constraints on the tuning
+
+A user can specify constraints on the configurations to be satisfied via the argument `config_constraints`. The `config_constraints` receives a list of such constraints to be satisfied. Specifically, each constraint is a tuple that consists of (1) a function that takes a configuration as input and returns a numerical value; (2) an operation chosen from "<=", ">=", "<" or ">"; (3) a numerical threshold.
+
+In the following code example, we constrain the output of `my_model_size`, which takes a configuration as input and outputs a numerical value, to be no larger than 40. 
+
+```python
+def my_model_size(config):
+    return config["n_estimators"] * config["max_leaves"]
+
+analysis = tune.run(...,
+    config_constraints = [(my_model_size, "<=", 40)],
+)
+```
+
+ You can also specify a list of metric constraints to be satisfied via the argument `metric_constraints`. Each element in the `metric_constraints` list is a tuple that consists of (1) a string specifying the name of the metric (the metric name must be defined and returned in the user-defined `evaluation_function`); (2) an operation chosen from "<=" or ">="; (3) a numerical threshold.
+
+ In the following code example, we constrain the metric `training_cost` to be no larger than 1 second.
+
+```python
+analysis = tune.run(...,
+    metric_constraints = [("training_cost", "<=", 1)],
+)
+```
+
+#### **`config_constraints` vs `metric_constraints`:**
+The key difference between these two types of constraints is that the calculation of constraints in `config_constraints` does not rely on the computation procedure in the evaluation function, i.e., in `evaluation_function`. For example, when a constraint only depends on the config itself, as shown in the code example. Due to this independence, constraints in `config_constraints` will be checked before evaluation. So configurations that do not satisfy `config_constraints` will not be evaluated.
+
+
+### Parallel tuning
+
+Related arguments:
+
+- `use_ray`: A boolean of whether to use ray as the backend.
+- `resources_per_trial`: A dictionary of the hardware resources to allocate per trial, e.g., `{'cpu': 1}`. Only valid when using ray backend.
+
+
+You can perform parallel tuning by specifying `use_ray=True` (requiring flaml[ray] option installed). You can also limit the amount of resources allocated per trial by specifying `resources_per_trial`, e.g., `resources_per_trial={'cpu': 2}`. 
+
+```python
+# require: pip install flaml[ray]
+analysis = tune.run(
+    evaluate_config,  # the function to evaluate a config
+    config=config_search_space,  # the search space defined
+    metric="score",
+    mode="min",  # the optimization mode, "min" or "max"
+    num_samples=-1,  # the maximal number of configs to try, -1 means infinite
+    time_budget_s=10,  # the time budget in seconds
+    use_ray=True,
+    resources_per_trial={"cpu": 2}  # limit resources allocated per trial
+)
+print(analysis.best_trial.last_result)  # the best trial's result
+print(analysis.best_config)  # the best config
+```
+
+**A heads-up about computation overhead.** When parallel tuning is used, there will be a certain amount of computation overhead in each trial. In case each trial's original cost is much smaller than the overhead, parallel tuning can underperform sequential tuning. Sequential tuning is recommended when compute resource is limited, and each trial can consume all the resources.
+
+
+### Trial scheduling
+
+Related arguments:
+- `scheduler`: A scheduler for executing the trials.
+- `resource_attr`: A string to specify the resource dimension used by the scheduler.
+- `min_resource`: A float of the minimal resource to use for the resource_attr.
+- `max_resource`: A float of the maximal resource to use for the resource_attr.
+- `reduction_factor`: A float of the reduction factor used for incremental pruning.
+
+A scheduler can help manage the trials' execution. It can be used to perform multi-fidelity evaluation, or/and early stopping. You can use two different types of schedulers in `flaml.tune` via `scheduler`.
+
+#### 1. An authentic scheduler implemented in FLAML (`scheduler='flaml'`).
+
+This scheduler is authentic to the new search algorithms provided by FLAML. In a nutshell, it starts the search with the minimum resource. It switches between HPO with the current resource and increasing the resource for evaluation depending on which leads to faster improvement. 
+
+If this scheduler is used, you need to
+- Specify a resource dimension. Conceptually a 'resource dimension' is a factor that affects the cost of the evaluation (e.g., sample size, the number of epochs). You need to specify the name of the resource dimension via `resource_attr`. For example, if `resource_attr="sample_size"`, then the config dict passed to the `evaluation_function` would contain a key "sample_size" and its value suggested by the search algorithm. That value should be used in the evaluation function to control the compute cost. The larger the value, the more expensive the evaluation is.
+
+- Provide the lower and upper limit of the resource dimension via `min_resource` and `max_resource`, and optionally provide `reduction_factor`, which determines the magnitude of resource (multiplicative) increase when we decide to increase the resource.
+
+In the following code example, we consider the sample size as the resource dimension. It determines how much data is used to perform training as reflected in the `evaluation_function`. We set the `min_resource` and `max_resource` to 1000 and the size of the full training dataset, respectively. 
+ +```python +from flaml import tune +from functools import partial +from flaml.automl.data import load_openml_task + + +def obj_from_resource_attr(resource_attr, X_train, X_test, y_train, y_test, config): + from lightgbm import LGBMClassifier + from sklearn.metrics import accuracy_score + + # in this example sample size is our resource dimension + resource = int(config[resource_attr]) + sampled_X_train = X_train.iloc[:resource] + sampled_y_train = y_train[:resource] + + # construct a LGBM model from the config + # note that you need to first remove the resource_attr field + # from the config as it is not part of the original search space + model_config = config.copy() + del model_config[resource_attr] + model = LGBMClassifier(**model_config) + + model.fit(sampled_X_train, sampled_y_train) + y_test_predict = model.predict(X_test) + test_loss = 1.0 - accuracy_score(y_test, y_test_predict) + return {resource_attr: resource, "loss": test_loss} + + +X_train, X_test, y_train, y_test = load_openml_task(task_id=7592, data_dir="test/") +max_resource = len(y_train) +resource_attr = "sample_size" +min_resource = 1000 +analysis = tune.run( + partial(obj_from_resource_attr, resource_attr, X_train, X_test, y_train, y_test), + config={ + "n_estimators": tune.lograndint(lower=4, upper=32768), + "max_leaves": tune.lograndint(lower=4, upper=32768), + "learning_rate": tune.loguniform(lower=1 / 1024, upper=1.0), + }, + metric="loss", + mode="min", + resource_attr=resource_attr, + scheduler="flaml", + max_resource=max_resource, + min_resource=min_resource, + reduction_factor=2, + time_budget_s=10, + num_samples=-1, +) +``` + +You can find more details about this scheduler in [this paper](https://arxiv.org/pdf/1911.04706.pdf). + + + +#### 2. A scheduler of the [`TrialScheduler`](https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#tune-schedulers) class from `ray.tune`. 
+
+There is a handful of schedulers of this type implemented in `ray.tune`, for example, [ASHA](https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#asha-tune-schedulers-ashascheduler), [HyperBand](https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#tune-original-hyperband), [BOHB](https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#tune-scheduler-bohb), etc.
+
+To use this type of scheduler you can either (1) set `scheduler='asha'`, which will automatically create an [ASHAScheduler](https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#asha-tune-schedulers-ashascheduler) instance using the provided inputs (`resource_attr`, `min_resource`, `max_resource`, and `reduction_factor`); or (2) create an instance by yourself and provide it via `scheduler`, as shown in the following code example,
+
+```python
+# require: pip install flaml[ray]
+from ray.tune.schedulers import HyperBandScheduler
+my_scheduler = HyperBandScheduler(time_attr="sample_size", max_t=max_resource, reduction_factor=2)
+tune.run(..., scheduler=my_scheduler, ...)
+```
+- Similar to the case where the `flaml` scheduler is used, you need to specify the resource dimension, use the resource dimension accordingly in your `evaluation_function`, and provide the necessary information needed for scheduling, such as `min_resource`, `max_resource` and `reduction_factor` (depending on the requirements of the specific scheduler).
+
+- Different from the case when the `flaml` scheduler is used, the amount of resources to use at each iteration is not suggested by the search algorithm through the `resource_attr` in a configuration. You need to specify the evaluation schedule explicitly by yourself in the `evaluation_function` and **report intermediate results (using `tune.report()`) accordingly**. In the following code example, we use the ASHA scheduler by setting `scheduler="asha"`. 
We specify `resource_attr`, `min_resource`, `max_resource` and `reduction_factor` the same way as in the previous example (when "flaml" is used as the scheduler). We perform the evaluation in a customized schedule.
+
+- Use ray backend or not? You can choose to use ray backend or not by specifying `use_ray=True` or `use_ray=False`. When ray backend is not used, i.e., `use_ray=False`, you also need to stop the evaluation function by explicitly catching the `StopIteration` exception, as shown in the end of the evaluation function `obj_w_intermediate_report()` in the following code example.
+
+```python
+def obj_w_intermediate_report(resource_attr, X_train, X_test, y_train, y_test, min_resource, max_resource, config):
+    from lightgbm import LGBMClassifier
+    from sklearn.metrics import accuracy_score
+
+    # a customized schedule to perform the evaluation
+    eval_schedule = [res for res in range(min_resource, max_resource, 5000)] + [max_resource]
+    for resource in eval_schedule:
+        sampled_X_train = X_train.iloc[:resource]
+        sampled_y_train = y_train[:resource]
+
+        # construct a LGBM model from the config
+        model = LGBMClassifier(**config)
+
+        model.fit(sampled_X_train, sampled_y_train)
+        y_test_predict = model.predict(X_test)
+        test_loss = 1.0 - accuracy_score(y_test, y_test_predict)
+        # need to report the resource attribute used and the corresponding intermediate results
+        try:
+            tune.report(sample_size=resource, loss=test_loss)
+        except (StopIteration, SystemExit):
+            # do cleanup operation here
+            return
+
+resource_attr = "sample_size"
+min_resource = 1000
+max_resource = len(y_train)
+analysis = tune.run(
+    partial(obj_w_intermediate_report, resource_attr, X_train, X_test, y_train, y_test, min_resource, max_resource),
+    config={
+        "n_estimators": tune.lograndint(lower=4, upper=32768),
+        "learning_rate": tune.loguniform(lower=1 / 1024, upper=1.0),
+    },
+    metric="loss",
+    mode="min",
+    resource_attr=resource_attr,
+    scheduler="asha",
+    max_resource=max_resource,
+    min_resource=min_resource,
+    reduction_factor=2,
+    time_budget_s=10,
+    num_samples=-1,
+)
+```
+
+- If you would like to do some cleanup operation when the trial is stopped
+by the scheduler, you can do it when you catch the `StopIteration` (when not using ray) or `SystemExit` (when using ray) exception explicitly.
+
+### Warm start
+
+Related arguments:
+
+- `points_to_evaluate`: A list of initial hyperparameter configurations to run first.
+- `evaluated_rewards`: If you have previously evaluated the parameters passed in as `points_to_evaluate`, you can avoid re-running those trials by passing in the reward attributes as a list so the optimizer can be told the results without needing to re-compute the trial. Must be the same length or shorter length than `points_to_evaluate`.
+
+If you are aware of some good hyperparameter configurations, you are encouraged to provide them via `points_to_evaluate`. The search algorithm will try them first and use them to bootstrap the search.
+
+You can use previously evaluated configurations to warm-start your tuning.
+For example, the following code means that you know the reward for the two configs in
+points_to_evaluate are 3.99 and 2.99, respectively, and want to
+inform `tune.run()`.
+
+```python
+def simple_obj(config):
+    return config["a"] + config["b"]
+
+from flaml import tune
+config_search_space = {
+    "a": tune.uniform(lower=0, upper=0.99),
+    "b": tune.uniform(lower=0, upper=3)
+}
+
+points_to_evaluate = [
+    {"b": .99, "a": 3},
+    {"b": .99, "a": 2},
+    {"b": .80, "a": 3},
+    {"b": .80, "a": 2},
+]
+evaluated_rewards = [3.99, 2.99]
+
+analysis = tune.run(
+    simple_obj,
+    config=config_search_space,
+    mode="max",
+    points_to_evaluate=points_to_evaluate,
+    evaluated_rewards=evaluated_rewards,
+    time_budget_s=10,
+    num_samples=-1,
+)
+```
+
+### Reproducibility
+
+By default, there is randomness in our tuning process (for versions <= 0.9.1). 
If reproducibility is desired, you could manually set a random seed before calling `tune.run()`. For example, in the following code, we call `np.random.seed(100)` to set the random seed.
+With this random seed, running the following code multiple times will generate exactly the same search trajectory. The reproducibility can only be guaranteed in sequential tuning.
+
+```python
+import numpy as np
+np.random.seed(100) # This line is not needed starting from version v0.9.2.
+analysis = tune.run(
+    simple_obj,
+    config=config_search_space,
+    mode="max",
+    num_samples=10,
+)
+```
+
+### Lexicographic Objectives
+We support tuning multiple objectives with lexicographic preference by providing argument `lexico_objectives` for `tune.run()`.
+`lexico_objectives` is a dictionary that contains the following fields of key-value pairs:
+ - `metrics`: a list of optimization objectives with the orders reflecting the priorities/preferences of the objectives.
+ - `modes`: (optional) a list of optimization modes (each mode either "min" or "max") corresponding to the objectives in the metric list. If not provided, we use "min" as the default mode for all the objectives.
+ - `tolerances`: (optional) a dictionary to specify the optimality tolerances on objectives. The keys are the metric names (provided in "metrics"), and the values are the numerical tolerance values.
+ - `targets`: (optional) a dictionary to specify the optimization targets on the objectives. The keys are the metric names (provided in "metrics"), and the values are the numerical target values.
+
+In the following example, we want to minimize `val_loss` and `pred_time` of the model where `val_loss` has high priority. The tolerances for `val_loss` and `pred_time` are 0.02 and 0 respectively. We do not have target values for these two objectives, so we set the targets to -inf for both objectives. 
+
+```python
+lexico_objectives = {}
+lexico_objectives["metrics"] = ["val_loss", "pred_time"]
+lexico_objectives["modes"] = ["min", "min"]
+lexico_objectives["tolerances"] = {"val_loss": 0.02, "pred_time": 0.0}
+lexico_objectives["targets"] = {"val_loss": -float('inf'), "pred_time": -float('inf')}
+
+# provide the lexico_objectives to tune.run
+tune.run(..., search_alg=None, lexico_objectives=lexico_objectives)
+```
+NOTE:
+
+1. When lexico_objectives is not None, the arguments metric and mode will be invalid, and flaml's tune uses CFO as the `search_alg`, which makes the input (if provided) `search_alg` invalid.
+
+2. This is a new feature that will be released in version 1.1.0 and is subject to change in the future version.
+
+## Hyperparameter Optimization Algorithm
+
+To tune the hyperparameters toward your objective, you will want to use a hyperparameter optimization algorithm which can help suggest hyperparameters with better performance (regarding your objective). `flaml` offers two HPO methods: CFO and BlendSearch. `flaml.tune` uses BlendSearch by default when the option [blendsearch] is installed.
+
+
+
+### CFO: Frugal Optimization for Cost-related Hyperparameters
+
+CFO uses the randomized direct search method FLOW2 with adaptive stepsize and random restart.
+It requires a low-cost initial point as input if such point exists.
+The search begins with the low-cost initial point and gradually moves to
+the high-cost region if needed. The local search method has a provable convergence
+rate and bounded cost.
+
+About FLOW2: FLOW2 is a simple yet effective randomized direct search method.
+It is an iterative optimization method that can optimize for black-box functions.
+FLOW2 only requires pairwise comparisons between function values to perform iterative update. Compared to existing HPO methods, FLOW2 has the following appealing properties:
+
+1. It is applicable to general black-box functions with a good convergence rate in terms of loss.
+1. It provides theoretical guarantees on the total evaluation cost incurred.
+
+The GIFs attached below demonstrate an example search trajectory of FLOW2 shown in the loss and evaluation cost (i.e., the training time) space respectively. FLOW2 is used in tuning the # of leaves and the # of trees for XGBoost. The two background heatmaps show the loss and cost distribution of all configurations. The black dots are the points evaluated in FLOW2. Black dots connected by lines are points that yield better loss performance when evaluated.
+
+![gif](images/heatmap_loss_cfo_12s.gif) | ![gif](images/heatmap_cost_cfo_12s.gif)
+:---:|:---:
+
+From the demonstration, we can see that (1) FLOW2 can quickly move toward the low-loss region, showing good convergence property and (2) FLOW2 tends to avoid exploring the high-cost region until necessary.
+
+Example:
+
+```python
+from flaml import CFO
+tune.run(...
+    search_alg=CFO(low_cost_partial_config=low_cost_partial_config),
+)
+```
+
+**Recommended scenario**: There exist cost-related hyperparameters and a low-cost
+initial point is known before optimization.
+If the search space is complex and CFO gets trapped into local optima, consider
+using BlendSearch.
+
+### BlendSearch: Economical Hyperparameter Optimization With Blended Search Strategy
+
+BlendSearch combines local search with global search. It leverages the frugality
+of CFO and the space exploration ability of global search methods such as
+Bayesian optimization. Like CFO, BlendSearch requires a low-cost initial point
+as input if such point exists, and starts the search from there. Different from
+CFO, BlendSearch will not wait for the local search to fully converge before
+trying new start points. The new start points are suggested by the global search
+method and filtered based on their distance to the existing points in the
+cost-related dimensions. BlendSearch still gradually increases the trial cost. 
+It prioritizes among the global search thread and multiple local search threads +based on optimism in face of uncertainty. + +Example: + +```python +# require: pip install flaml[blendsearch] +from flaml import BlendSearch +tune.run(... + search_alg=BlendSearch(low_cost_partial_config=low_cost_partial_config), +) +``` + +**Recommended scenario**: Cost-related hyperparameters exist, a low-cost +initial point is known, and the search space is complex such that local search +is prone to be stuck at local optima. + +**Suggestion about using larger search space in BlendSearch**. +In hyperparameter optimization, a larger search space is desirable because it is more likely to include the optimal configuration (or one of the optimal configurations) in hindsight. However the performance (especially anytime performance) of most existing HPO methods is undesirable if the cost of the configurations in the search space has a large variation. Thus hand-crafted small search spaces (with relatively homogeneous cost) are often used in practice for these methods, which is subject to idiosyncrasy. BlendSearch combines the benefits of local search and global search, which enables a smart (economical) way of deciding where to explore in the search space even though it is larger than necessary. This allows users to specify a larger search space in BlendSearch, which is often easier and a better practice than narrowing down the search space by hand. + +For more technical details, please check our papers. + +* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021. 
+ +```bibtex +@inproceedings{wu2021cfo, + title={Frugal Optimization for Cost-related Hyperparameters}, + author={Qingyun Wu and Chi Wang and Silu Huang}, + year={2021}, + booktitle={AAAI'21}, +} +``` + +* [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021. + +```bibtex +@inproceedings{wang2021blendsearch, + title={Economical Hyperparameter Optimization With Blended Search Strategy}, + author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied}, + year={2021}, + booktitle={ICLR'21}, +} +``` diff --git a/website/versioned_docs/version-1.0.4/Use-Cases/Zero-Shot-AutoML.md b/website/versioned_docs/version-1.0.4/Use-Cases/Zero-Shot-AutoML.md new file mode 100644 index 0000000000..bbda225660 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/Use-Cases/Zero-Shot-AutoML.md @@ -0,0 +1,250 @@ +# Zero Shot AutoML + +`flaml.default` is a package for zero-shot AutoML, or "no-tuning" AutoML. It uses [`flaml.AutoML`](../reference/automl/automl#automl-objects) and [`flaml.default.portfolio`](../reference/default/portfolio) to mine good hyperparameter configurations across different datasets offline, and recommend data-dependent default configurations at runtime without expensive tuning. + +Zero-shot AutoML has several benefits: +* The computation cost is just training one model. No tuning is involved. +* The decision of hyperparameter configuration is instant. No overhead to worry about. +* Your code remains the same. No breaking of the existing workflow. +* It requires less input from the user. No need to specify a tuning budget etc. +* All training data are used for, guess what, training. No need to worry about holding a subset of training data for validation (and overfitting the validation data). 
+* The offline preparation can be customized for a domain and leverage the historical tuning data. No experience is wasted. + +## How to Use at Runtime + +The easiest way to leverage this technique is to import a "flamlized" learner of your favorite choice and use it just as how you use the learner before. The automation is done behind the scene and you are not required to change your code. For example, if you are currently using: + +```python +from lightgbm import LGBMRegressor + +estimator = LGBMRegressor() +estimator.fit(X_train, y_train) +estimator.predict(X_test) +``` + +Simply replace the first line with: + +```python +from flaml.default import LGBMRegressor +``` + +All the other code remains the same. And you are expected to get a equal or better model in most cases. + +The current list of "flamlized" learners are: +* LGBMClassifier, LGBMRegressor. +* XGBClassifier, XGBRegressor. +* RandomForestClassifier, RandomForestRegressor. +* ExtraTreesClassifier, ExtraTreesRegressor. + +### What's the magic behind the scene? + +`flaml.default.LGBMRegressor` inherits `lightgbm.LGBMRegressor`, so all the APIs in `lightgbm.LGBMRegressor` are still valid in `flaml.default.LGBMRegressor`. The difference is, `flaml.default.LGBMRegressor` decides the hyperparameter configurations based on the training data. It would use a different configuration if it is predicted to outperform the original data-independent default. If you inspect the params of the fitted estimator, you can find what configuration is used. If the original default configuration is used, then it is equivalent to the original estimator. + +The recommendation of which configuration should be used is based on offline AutoML run results. Information about the training dataset, such as the size of the dataset will be used to recommend a data-dependent configuration. The recommendation is done instantly in negligible time. 
The training can be faster or slower than using the original default configuration depending on the recommended configuration. Note that there is no tuning involved. Only one model is trained. + +### Can I check the configuration before training? + +Yes. You can use `suggest_hyperparams()` to find the suggested configuration. For example, + +```python +from flaml.default import LGBMRegressor + +estimator = LGBMRegressor() +hyperparams, estimator_name, X_transformed, y_transformed = estimator.suggest_hyperparams(X_train, y_train) +print(hyperparams) +``` + +If you would like more control over the training, use an equivalent, open-box way for zero-shot AutoML. For example, + +```python +from flaml.default import preprocess_and_suggest_hyperparams + +X, y = load_iris(return_X_y=True, as_frame=True) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) +hyperparams, estimator_class, X_transformed, y_transformed, feature_transformer, label_transformer = preprocess_and_suggest_hyperparams( + "classification", X_train, y_train, "lgbm" +) +model = estimator_class(**hyperparams) # estimator_class is lightgbm.LGBMClassifier +model.fit(X_transformed, y_train) # LGBMClassifier can handle raw labels +X_test = feature_transformer.transform(X_test) # preprocess test data +y_pred = model.predict(X_test) +``` + +Note that some classifiers like XGBClassifier require the labels to be integers, while others do not. So you can decide whether to use the transformed labels `y_transformed` and the label transformer `label_transformer`. +Also, each estimator may require specific preprocessing of the data. `X_transformed` is the preprocessed data, and `feature_transformer` is the preprocessor. It needs to be applied to the test data before prediction. These are automated when you use the "flamlized" learner. When you use the open-box way, pay attention to them. + +### Combine zero shot AutoML and hyperparameter tuning + +Zero Shot AutoML is fast. 
If tuning from the recommended data-dependent configuration is required, you can use `flaml.AutoML.fit()` and set `starting_points="data"`. For example,
+
+```python
+from flaml import AutoML
+automl = AutoML()
+automl_settings = {
+    "task": "classification",
+    "starting_points": "data",
+    "estimator_list": ["lgbm"],
+    "time_budget": 600,
+    "max_iter": 50,
+}
+automl.fit(X_train, y_train, **automl_settings)
+```
+
+Note that if you set `max_iter=0` and `time_budget=None`, you are effectively using zero-shot AutoML. When `estimator_list` is omitted, the estimator together with its hyperparameter configuration will be decided in a zero-shot manner.
+
+### Use your own meta-learned defaults
+
+To use your own meta-learned defaults, specify the path containing the meta-learned defaults. For example,
+
+```python
+estimator = flaml.default.LGBMRegressor(default_location="location_for_defaults")
+```
+
+Or,
+
+```python
+preprocess_and_suggest_hyperparams(
+    "classification", X_train, y_train, "lgbm", location="location_for_defaults"
+)
+```
+
+Or,
+
+```python
+X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
+automl = AutoML()
+automl_settings = {
+    "task": "classification",
+    "log_file_name": "test/iris.log",
+    "starting_points": "data:location_for_defaults",
+    "estimator_list": ["lgbm", "xgb_limitdepth", "rf"],
+    "max_iter": 0,
+}
+automl.fit(X_train, y_train, **automl_settings)
+```
+
+Since this is a multiclass task, it will look for the following files under `{location_for_defaults}/`:
+
+- `all/multiclass.json`.
+- `{learner_name}/multiclass.json` for every learner_name in the estimator_list.
+
+Read the next section to understand how to generate these files if you would like to meta-learn the defaults yourself.
+
+## How to Prepare Offline
+
+This section is intended for:
+1. AutoML providers for a particular domain.
+1. Data scientists or engineers who need to repeatedly train models for similar tasks with varying training data. 
+ +Instead of running full hyperparameter tuning from scratch every time, one can leverage the tuning experiences in similar tasks before. While we have offered the meta-learned defaults from tuning experiences of several popular learners on benchmark datasets for classification and regression, you can customize the defaults for your own tasks/learners/metrics based on your own tuning experiences. + +### Prepare a collection of training tasks + +Collect a diverse set of training tasks. For each task, extract its meta feature and save in a .csv file. For example, test/default/all/metafeatures.csv: + +``` +Dataset,NumberOfInstances,NumberOfFeatures,NumberOfClasses,PercentageOfNumericFeatures +2dplanes,36691,10,0,1.0 +adult,43957,14,2,0.42857142857142855 +Airlines,485444,7,2,0.42857142857142855 +Albert,382716,78,2,0.3333333333333333 +Amazon_employee_access,29492,9,2,0.0 +bng_breastTumor,104976,9,0,0.1111111111111111 +bng_pbc,900000,18,0,0.5555555555555556 +car,1555,6,4,0.0 +connect-4,60801,42,3,0.0 +dilbert,9000,2000,5,1.0 +Dionis,374569,60,355,1.0 +poker,922509,10,0,1.0 +``` + +The first column is the dataset name, and the latter four are meta features. + +### Prepare the candidate configurations + +You can extract the best configurations for each task in your collection of training tasks by running flaml on each of them with a long enough budget. Save the best configuration in a .json file under `{location_for_defaults}/{learner_name}/{task_name}.json`. For example, + +```python +X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame) +automl.fit(X_train, y_train, estimator_list=["lgbm"], **settings) +automl.save_best_config("test/default/lgbm/iris.json") +``` + +### Evaluate each candidate configuration on each task + +Save the evaluation results in a .csv file. 
For example, save the evaluation results for lgbm under `test/default/lgbm/results.csv`: + +``` +task,fold,type,result,params +2dplanes,0,regression,0.946366,{'_modeljson': 'lgbm/2dplanes.json'} +2dplanes,0,regression,0.907774,{'_modeljson': 'lgbm/adult.json'} +2dplanes,0,regression,0.901643,{'_modeljson': 'lgbm/Airlines.json'} +2dplanes,0,regression,0.915098,{'_modeljson': 'lgbm/Albert.json'} +2dplanes,0,regression,0.302328,{'_modeljson': 'lgbm/Amazon_employee_access.json'} +2dplanes,0,regression,0.94523,{'_modeljson': 'lgbm/bng_breastTumor.json'} +2dplanes,0,regression,0.945698,{'_modeljson': 'lgbm/bng_pbc.json'} +2dplanes,0,regression,0.946194,{'_modeljson': 'lgbm/car.json'} +2dplanes,0,regression,0.945549,{'_modeljson': 'lgbm/connect-4.json'} +2dplanes,0,regression,0.946232,{'_modeljson': 'lgbm/default.json'} +2dplanes,0,regression,0.945594,{'_modeljson': 'lgbm/dilbert.json'} +2dplanes,0,regression,0.836996,{'_modeljson': 'lgbm/Dionis.json'} +2dplanes,0,regression,0.917152,{'_modeljson': 'lgbm/poker.json'} +adult,0,binary,0.927203,{'_modeljson': 'lgbm/2dplanes.json'} +adult,0,binary,0.932072,{'_modeljson': 'lgbm/adult.json'} +adult,0,binary,0.926563,{'_modeljson': 'lgbm/Airlines.json'} +adult,0,binary,0.928604,{'_modeljson': 'lgbm/Albert.json'} +adult,0,binary,0.911171,{'_modeljson': 'lgbm/Amazon_employee_access.json'} +adult,0,binary,0.930645,{'_modeljson': 'lgbm/bng_breastTumor.json'} +adult,0,binary,0.928603,{'_modeljson': 'lgbm/bng_pbc.json'} +adult,0,binary,0.915825,{'_modeljson': 'lgbm/car.json'} +adult,0,binary,0.919499,{'_modeljson': 'lgbm/connect-4.json'} +adult,0,binary,0.930109,{'_modeljson': 'lgbm/default.json'} +adult,0,binary,0.932453,{'_modeljson': 'lgbm/dilbert.json'} +adult,0,binary,0.921959,{'_modeljson': 'lgbm/Dionis.json'} +adult,0,binary,0.910763,{'_modeljson': 'lgbm/poker.json'} +... +``` + +The `type` column indicates the type of the task, such as regression, binary or multiclass. 
+The `result` column stores the evaluation result, assumed the large the better. The `params` column indicates which json config is used. For example 'lgbm/2dplanes.json' indicates that the best lgbm configuration extracted from 2dplanes is used. +Different types of tasks can appear in the same file, as long as any json config file can be used in all the tasks. For example, 'lgbm/2dplanes.json' is extracted from a regression task, and it can be applied to binary and multiclass tasks as well. + +### Learn data-dependent defaults + +To recap, the inputs required for meta-learning are: + +1. Metafeatures: e.g., `{location}/all/metafeatures.csv`. +1. Configurations: `{location}/{learner_name}/{task_name}.json`. +1. Evaluation results: `{location}/{learner_name}/results.csv`. + +For example, if the input location is "test/default", learners are lgbm, xgb_limitdepth and rf, the following command learns data-dependent defaults for binary classification tasks. + +```bash +python portfolio.py --output test/default --input test/default --metafeatures test/default/all/metafeatures.csv --task binary --estimator lgbm xgb_limitdepth rf +``` + +In a few seconds, it will produce the following files as output: + +- test/default/lgbm/binary.json: the learned defaults for lgbm. +- test/default/xgb_limitdepth/binary.json: the learned defaults for xgb_limitdepth. +- test/default/rf/binary.json: the learned defaults for rf. +- test/default/all/binary.json: the learned defaults for lgbm, xgb_limitdepth and rf together. + +Change "binary" into "multiclass" or "regression", or your own types in your "results.csv" for the other types of tasks. To update the learned defaults when more experiences are available, simply update your input files and rerun the learning command. + +### "Flamlize" a learner + +You have now effectively built your own zero-shot AutoML solution. Congratulations! 
+ +Optionally, you can "flamlize" a learner using [`flaml.default.flamlize_estimator`](../reference/default/estimator#flamlize_estimator) for easy dissemination. For example, + +```python +import sklearn.ensemble as ensemble +from flaml.default import flamlize_estimator + +ExtraTreesClassifier = flamlize_estimator( + ensemble.ExtraTreesClassifier, "extra_tree", "classification" +) +``` + +Then, you can share this "flamlized" `ExtraTreesClassifier` together with the location of your learned defaults with others (or the _future_ yourself). They will benefit from your past experience. Your group can also share experiences in a central place and update the learned defaults continuously. Over time, your organization gets better collectively. \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.4/Use-Cases/images/BlendSearch.png b/website/versioned_docs/version-1.0.4/Use-Cases/images/BlendSearch.png new file mode 100644 index 0000000000..db93d825f9 Binary files /dev/null and b/website/versioned_docs/version-1.0.4/Use-Cases/images/BlendSearch.png differ diff --git a/website/versioned_docs/version-1.0.4/Use-Cases/images/CFO.png b/website/versioned_docs/version-1.0.4/Use-Cases/images/CFO.png new file mode 100644 index 0000000000..bec6070e9a Binary files /dev/null and b/website/versioned_docs/version-1.0.4/Use-Cases/images/CFO.png differ diff --git a/website/versioned_docs/version-1.0.4/Use-Cases/images/curve.png b/website/versioned_docs/version-1.0.4/Use-Cases/images/curve.png new file mode 100644 index 0000000000..a421b0cd32 Binary files /dev/null and b/website/versioned_docs/version-1.0.4/Use-Cases/images/curve.png differ diff --git a/website/versioned_docs/version-1.0.4/Use-Cases/images/feature_importance.png b/website/versioned_docs/version-1.0.4/Use-Cases/images/feature_importance.png new file mode 100644 index 0000000000..3b1c361730 Binary files /dev/null and b/website/versioned_docs/version-1.0.4/Use-Cases/images/feature_importance.png differ 
diff --git a/website/versioned_docs/version-1.0.4/Use-Cases/images/heatmap_cost_cfo_12s.gif b/website/versioned_docs/version-1.0.4/Use-Cases/images/heatmap_cost_cfo_12s.gif new file mode 100644 index 0000000000..5093f9c80e Binary files /dev/null and b/website/versioned_docs/version-1.0.4/Use-Cases/images/heatmap_cost_cfo_12s.gif differ diff --git a/website/versioned_docs/version-1.0.4/Use-Cases/images/heatmap_loss_cfo_12s.gif b/website/versioned_docs/version-1.0.4/Use-Cases/images/heatmap_loss_cfo_12s.gif new file mode 100644 index 0000000000..9cc0968b49 Binary files /dev/null and b/website/versioned_docs/version-1.0.4/Use-Cases/images/heatmap_loss_cfo_12s.gif differ diff --git a/website/versioned_docs/version-1.0.4/reference/automl/automl.md b/website/versioned_docs/version-1.0.4/reference/automl/automl.md new file mode 100644 index 0000000000..1795e961dd --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/automl/automl.md @@ -0,0 +1,1067 @@ +--- +sidebar_label: automl +title: automl.automl +--- + +## AutoMLState Objects + +```python +class AutoMLState() +``` + +#### sanitize + +```python +@classmethod +def sanitize(cls, config: dict) -> dict +``` + +Make a config ready for passing to estimator. + +#### size + +```python +def size(state: AutoMLState, config: dict) -> float +``` + +Size function. + +**Returns**: + + The mem size in bytes for a config. + +## AutoML Objects + +```python +class AutoML(BaseEstimator) +``` + +The AutoML class. + +**Example**: + + +```python +automl = AutoML() +automl_settings = { + "time_budget": 60, + "metric": 'accuracy', + "task": 'classification', + "log_file_name": 'mylog.log', +} +automl.fit(X_train = X_train, y_train = y_train, **automl_settings) +``` + +#### \_\_init\_\_ + +```python +def __init__(**settings) +``` + +Constructor. + +Many settings in fit() can be passed to the constructor too. +If an argument in fit() is provided, it will override the setting passed to the constructor. 
+If an argument in fit() is not provided but provided in the constructor, the value passed to the constructor will be used. + +**Arguments**: + +- `metric` - A string of the metric name or a function, + e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_weighted', + 'roc_auc_ovo_weighted', 'roc_auc_ovr_weighted', 'f1', 'micro_f1', 'macro_f1', + 'log_loss', 'mae', 'mse', 'r2', 'mape'. Default is 'auto'. + If passing a customized metric function, the function needs to + have the following input arguments: + +```python +def custom_metric( + X_test, y_test, estimator, labels, + X_train, y_train, weight_test=None, weight_train=None, + config=None, groups_test=None, groups_train=None, +): + return metric_to_minimize, metrics_to_log +``` + which returns a float number as the minimization objective, + and a dictionary as the metrics to log. E.g., + +```python +def custom_metric( + X_val, y_val, estimator, labels, + X_train, y_train, weight_val=None, weight_train=None, + *args, +): + from sklearn.metrics import log_loss + import time + + start = time.time() + y_pred = estimator.predict_proba(X_val) + pred_time = (time.time() - start) / len(X_val) + val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val) + y_pred = estimator.predict_proba(X_train) + train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train) + alpha = 0.5 + return val_loss * (1 + alpha) - alpha * train_loss, { + "val_loss": val_loss, + "train_loss": train_loss, + "pred_time": pred_time, + } +``` +- `task` - A string of the task type, e.g., + 'classification', 'regression', 'ts_forecast', 'rank', + 'seq-classification', 'seq-regression', 'summarization'. +- `n_jobs` - An integer of the number of threads for training | default=-1. + Use all available resources when n_jobs == -1. +- `log_file_name` - A string of the log file name | default="". To disable logging, + set it to be an empty string "". 
+- `estimator_list` - A list of strings for estimator names, or 'auto'. + e.g., ```['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']```. +- `time_budget` - A float number of the time budget in seconds. + Use -1 if no time limit. +- `max_iter` - An integer of the maximal number of iterations. +- `sample` - A boolean of whether to sample the training data during + search. +- `ensemble` - boolean or dict | default=False. Whether to perform + ensemble after search. Can be a dict with keys 'passthrough' + and 'final_estimator' to specify the passthrough and + final_estimator in the stacker. The dict can also contain + 'n_jobs' as the key to specify the number of jobs for the stacker. +- `eval_method` - A string of resampling strategy, one of + ['auto', 'cv', 'holdout']. +- `split_ratio` - A float of the validation data percentage for holdout. +- `n_splits` - An integer of the number of folds for cross-validation. +- `log_type` - A string of the log type, one of + ['better', 'all']. + 'better' only logs configs with better loss than previous iters + 'all' logs all the tried configs. +- `model_history` - A boolean of whether to keep the best + model per estimator. Make sure memory is large enough if setting to True. +- `log_training_metric` - A boolean of whether to log the training + metric for each model. +- `mem_thres` - A float of the memory size constraint in bytes. +- `pred_time_limit` - A float of the prediction latency constraint in seconds. + It refers to the average prediction time per row in validation data. +- `train_time_limit` - A float of the training time constraint in seconds. +- `verbose` - int, default=3 | Controls the verbosity, higher means more + messages. +- `retrain_full` - bool or str, default=True | whether to retrain the + selected model on the full training data when using holdout. + True - retrain only after search finishes; False - no retraining; + 'budget' - do best effort to retrain without violating the time + budget. 
+- `split_type` - str or splitter object, default="auto" | the data split type. + * A valid splitter object is an instance of a derived class of scikit-learn + [KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold) + and have ``split`` and ``get_n_splits`` methods with the same signatures. + Set eval_method to "cv" to use the splitter object. + * Valid str options depend on different tasks. + For classification tasks, valid choices are + ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified. + For regression tasks, valid choices are ["auto", 'uniform', 'time']. + "auto" -> uniform. + For time series forecast tasks, must be "auto" or 'time'. + For ranking task, must be "auto" or 'group'. +- `hpo_method` - str, default="auto" | The hyperparameter + optimization method. By default, CFO is used for sequential + search and BlendSearch is used for parallel search. + No need to set when using flaml's default search space or using + a simple customized search space. When set to 'bs', BlendSearch + is used. BlendSearch can be tried when the search space is + complex, for example, containing multiple disjoint, discontinuous + subspaces. When set to 'random', random search is used. +- `starting_points` - A dictionary or a str to specify the starting hyperparameter + config for the estimators | default="static". + If str: + - if "data", use data-dependent defaults; + - if "data:path" use data-dependent defaults which are stored at path; + - if "static", use data-independent defaults. + If dict, keys are the name of the estimators, and values are the starting + hyperparameter configurations for the corresponding estimators. + The value can be a single hyperparameter configuration dict or a list + of hyperparameter configuration dicts. + In the following code example, we get starting_points from the + `automl` object and use them in the `new_automl` object. 
+ e.g., + +```python +from flaml import AutoML +automl = AutoML() +X_train, y_train = load_iris(return_X_y=True) +automl.fit(X_train, y_train) +starting_points = automl.best_config_per_estimator + +new_automl = AutoML() +new_automl.fit(X_train, y_train, starting_points=starting_points) +``` + +- `seed` - int or None, default=None | The random seed for hpo. +- `n_concurrent_trials` - [Experimental] int, default=1 | The number of + concurrent trials. When n_concurrent_trials > 1, flaml performs + [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning) + and installation of ray is required: `pip install flaml[ray]`. +- `keep_search_state` - boolean, default=False | Whether to keep data needed + for model search after fit(). By default the state is deleted for + space saving. +- `preserve_checkpoint` - boolean, default=True | Whether to preserve the saved checkpoint + on disk when deleting automl. By default the checkpoint is preserved. +- `early_stop` - boolean, default=False | Whether to stop early if the + search is considered to converge. +- `append_log` - boolean, default=False | Whether to directly append the log + records to the input log file if it exists. +- `auto_augment` - boolean, default=True | Whether to automatically + augment rare classes. +- `min_sample_size` - int, default=MIN_SAMPLE_TRAIN | the minimal sample + size when sample=True. +- `use_ray` - boolean or dict. + If boolean: default=False | Whether to use ray to run the training + in separate processes. This can be used to prevent OOM for large + datasets, but will incur more overhead in time. + If dict: the dict contains the keyword arguments to be passed to + [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html). +- `free_mem_ratio` - float between 0 and 1, default=0. The free memory ratio to keep during training. +- `metric_constraints` - list, default=[] | The list of metric constraints. 
+ Each element in this list is a 3-tuple, which shall be expressed + in the following format: the first element of the 3-tuple is the name of the + metric, the second element is the inequality sign chosen from ">=" and "<=", + and the third element is the constraint value. E.g., `('val_loss', '<=', 0.1)`. + Note that all the metric names in metric_constraints need to be reported via + the metrics_to_log dictionary returned by a customized metric function. + The customized metric function shall be provided via the `metric` key word + argument of the fit() function or the automl constructor. + Find an example in the 4th constraint type in this [doc](../../Use-Cases/Task-Oriented-AutoML#constraint). + If `pred_time_limit` is provided as one of keyword arguments to fit() function or + the automl constructor, flaml will automatically (and under the hood) + add it as an additional element in the metric_constraints. Essentially 'pred_time_limit' + specifies a constraint about the prediction latency constraint in seconds. +- `custom_hp` - dict, default=None | The custom search space specified by user. + It is a nested dict with keys being the estimator names, and values being dicts + per estimator search space. In the per estimator search space dict, + the keys are the hyperparameter names, and values are dicts of info ("domain", + "init_value", and "low_cost_init_value") about the search space associated with + the hyperparameter (i.e., per hyperparameter search space dict). When custom_hp + is provided, the built-in search space which is also a nested dict of per estimator + search space dict, will be updated with custom_hp. Note that during this nested dict update, + the per hyperparameter search space dicts will be replaced (instead of updated) by the ones + provided in custom_hp. Note that the value for "domain" can either be a constant + or a sample.Domain object. 
+ e.g., + +```python +custom_hp = { + "transformer_ms": { + "model_path": { + "domain": "albert-base-v2", + }, + "learning_rate": { + "domain": tune.choice([1e-4, 1e-5]), + } + } + } +``` +- `skip_transform` - boolean, default=False | Whether to pre-process data prior to modeling. +- `fit_kwargs_by_estimator` - dict, default=None | The user specified keywords arguments, grouped by estimator name. + e.g., + +```python +fit_kwargs_by_estimator = { + "transformer": { + "output_dir": "test/data/output/", + "fp16": False, + } +} +``` + +#### config\_history + +```python +@property +def config_history() +``` + +A dictionary of iter->(estimator, config, time), +storing the best estimator, config, and the time when the best +model is updated each time. + +#### model + +```python +@property +def model() +``` + +An object with `predict()` and `predict_proba()` method (for +classification), storing the best trained model. + +#### best\_model\_for\_estimator + +```python +def best_model_for_estimator(estimator_name) +``` + +Return the best model found for a particular estimator. + +**Arguments**: + +- `estimator_name` - a str of the estimator's name. + + +**Returns**: + + An object storing the best model for estimator_name. + If `model_history` was set to False during fit(), then the returned model + is untrained unless estimator_name is the best estimator. + If `model_history` was set to True, then the returned model is trained. + +#### best\_estimator + +```python +@property +def best_estimator() +``` + +A string indicating the best estimator found. + +#### best\_iteration + +```python +@property +def best_iteration() +``` + +An integer of the iteration number where the best +config is found. + +#### best\_config + +```python +@property +def best_config() +``` + +A dictionary of the best configuration. + +#### best\_config\_per\_estimator + +```python +@property +def best_config_per_estimator() +``` + +A dictionary of all estimators' best configuration. 
+ +#### best\_loss\_per\_estimator + +```python +@property +def best_loss_per_estimator() +``` + +A dictionary of all estimators' best loss. + +#### best\_loss + +```python +@property +def best_loss() +``` + +A float of the best loss found. + +#### best\_result + +```python +@property +def best_result() +``` + +Result dictionary for model trained with the best config. + +#### metrics\_for\_best\_config + +```python +@property +def metrics_for_best_config() +``` + +Returns a float of the best loss, and a dictionary of the auxiliary metrics to log +associated with the best config. These two objects correspond to the returned +objects by the customized metric function for the config with the best loss. + +#### best\_config\_train\_time + +```python +@property +def best_config_train_time() +``` + +A float of the seconds taken by training the best config. + +#### classes\_ + +```python +@property +def classes_() +``` + +A numpy array of shape (n_classes,) for class labels. + +#### time\_to\_find\_best\_model + +```python +@property +def time_to_find_best_model() -> float +``` + +Time taken to find best model in seconds. + +#### predict + +```python +def predict(X: Union[np.array, pd.DataFrame, List[str], List[List[str]]], **pred_kwargs, ,) +``` + +Predict label from features. + +**Arguments**: + +- `X` - A numpy array of featurized instances, shape n * m, + or for time series forcast tasks: + a pandas dataframe with the first column containing + timestamp values (datetime type) or an integer n for + the predict steps (only valid when the estimator is + arima or sarimax). Other columns in the dataframe + are assumed to be exogenous variables (categorical + or numeric). +- `**pred_kwargs` - Other key word arguments to pass to predict() function of + the searched learners, such as per_device_eval_batch_size. 
+ +```python +multivariate_X_test = pd.DataFrame({ + 'timeStamp': pd.date_range(start='1/1/2022', end='1/07/2022'), + 'categorical_col': ['yes', 'yes', 'no', 'no', 'yes', 'no', 'yes'], + 'continuous_col': [105, 107, 120, 118, 110, 112, 115] +}) +model.predict(multivariate_X_test) +``` + + +**Returns**: + + A array-like of shape n * 1: each element is a predicted + label for an instance. + +#### predict\_proba + +```python +def predict_proba(X, **pred_kwargs) +``` + +Predict the probability of each class from features, only works for +classification problems. + +**Arguments**: + +- `X` - A numpy array of featurized instances, shape n * m. +- `**pred_kwargs` - Other key word arguments to pass to predict_proba() function of + the searched learners, such as per_device_eval_batch_size. + + +**Returns**: + + A numpy array of shape n * c. c is the # classes. Each element at + (i, j) is the probability for instance i to be in class j. + +#### add\_learner + +```python +def add_learner(learner_name, learner_class) +``` + +Add a customized learner. + +**Arguments**: + +- `learner_name` - A string of the learner's name. +- `learner_class` - A subclass of flaml.model.BaseEstimator. + +#### get\_estimator\_from\_log + +```python +def get_estimator_from_log(log_file_name, record_id, task) +``` + +Get the estimator from log file. + +**Arguments**: + +- `log_file_name` - A string of the log file name. +- `record_id` - An integer of the record ID in the file, + 0 corresponds to the first trial. +- `task` - A string of the task type, + 'binary', 'multiclass', 'regression', 'ts_forecast', 'rank'. + + +**Returns**: + + An estimator object for the given configuration. 
+ +#### retrain\_from\_log + +```python +def retrain_from_log(log_file_name, X_train=None, y_train=None, dataframe=None, label=None, time_budget=np.inf, task=None, eval_method=None, split_ratio=None, n_splits=None, split_type=None, groups=None, n_jobs=-1, train_best=True, train_full=False, record_id=-1, auto_augment=None, custom_hp=None, skip_transform=None, preserve_checkpoint=True, fit_kwargs_by_estimator=None, **fit_kwargs, ,) +``` + +Retrain from log file. + +This function is intended to retrain the logged configurations. +NOTE: In some rare case, the last config is early stopped to meet time_budget and it's the best config. +But the logged config's ITER_HP (e.g., n_estimators) is not reduced. + +**Arguments**: + +- `log_file_name` - A string of the log file name. +- `X_train` - A numpy array or dataframe of training data in shape n*m. + For time series forecast tasks, the first column of X_train + must be the timestamp column (datetime type). Other + columns in the dataframe are assumed to be exogenous + variables (categorical or numeric). +- `y_train` - A numpy array or series of labels in shape n*1. +- `dataframe` - A dataframe of training data including label column. + For time series forecast tasks, dataframe must be specified and should + have at least two columns: timestamp and label, where the first + column is the timestamp column (datetime type). Other columns + in the dataframe are assumed to be exogenous variables + (categorical or numeric). +- `label` - A str of the label column name, e.g., 'label'; +- `Note` - If X_train and y_train are provided, + dataframe and label are ignored; + If not, dataframe and label must be provided. +- `time_budget` - A float number of the time budget in seconds. +- `task` - A string of the task type, e.g., + 'classification', 'regression', 'ts_forecast', 'rank', + 'seq-classification', 'seq-regression', 'summarization'. +- `eval_method` - A string of resampling strategy, one of + ['auto', 'cv', 'holdout']. 
+- `split_ratio` - A float of the validation data percentage for holdout. +- `n_splits` - An integer of the number of folds for cross-validation. +- `split_type` - str or splitter object, default="auto" | the data split type. + * A valid splitter object is an instance of a derived class of scikit-learn + [KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold) + and have ``split`` and ``get_n_splits`` methods with the same signatures. + Set eval_method to "cv" to use the splitter object. + * Valid str options depend on different tasks. + For classification tasks, valid choices are + ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified. + For regression tasks, valid choices are ["auto", 'uniform', 'time']. + "auto" -> uniform. + For time series forecast tasks, must be "auto" or 'time'. + For ranking task, must be "auto" or 'group'. +- `groups` - None or array-like | Group labels (with matching length to + y_train) or groups counts (with sum equal to length of y_train) + for training data. +- `n_jobs` - An integer of the number of threads for training | default=-1. + Use all available resources when n_jobs == -1. +- `train_best` - A boolean of whether to train the best config in the + time budget; if false, train the last config in the budget. +- `train_full` - A boolean of whether to train on the full data. If true, + eval_method and sample_size in the log file will be ignored. +- `record_id` - the ID of the training log record from which the model will + be retrained. By default `record_id = -1` which means this will be + ignored. `record_id = 0` corresponds to the first trial, and + when `record_id >= 0`, `time_budget` will be ignored. +- `auto_augment` - boolean, default=True | Whether to automatically + augment rare classes. 
+- `custom_hp` - dict, default=None | The custom search space specified by user + Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the + domain of the custom search space can either be a value or a sample.Domain object. + +```python +custom_hp = { + "transformer_ms": { + "model_path": { + "domain": "albert-base-v2", + }, + "learning_rate": { + "domain": tune.choice([1e-4, 1e-5]), + } + } +} +``` +- `fit_kwargs_by_estimator` - dict, default=None | The user specified keywords arguments, grouped by estimator name. + e.g., + +```python +fit_kwargs_by_estimator = { + "transformer": { + "output_dir": "test/data/output/", + "fp16": False, + } +} +``` + +- `**fit_kwargs` - Other key word arguments to pass to fit() function of + the searched learners, such as sample_weight. Below are a few examples of + estimator-specific parameters: +- `period` - int | forecast horizon for all time series forecast tasks. +- `gpu_per_trial` - float, default = 0 | A float of the number of gpus per trial, + only used by TransformersEstimator, XGBoostSklearnEstimator, and + TemporalFusionTransformerEstimator. +- `group_ids` - list of strings of column names identifying a time series, only + used by TemporalFusionTransformerEstimator, required for + 'ts_forecast_panel' task. `group_ids` is a parameter for TimeSeriesDataSet object + from PyTorchForecasting. + For other parameters to describe your dataset, refer to + [TimeSeriesDataSet PyTorchForecasting](https://pytorch-forecasting.readthedocs.io/en/stable/api/pytorch_forecasting.data.timeseries.TimeSeriesDataSet.html). + To specify your variables, use `static_categoricals`, `static_reals`, + `time_varying_known_categoricals`, `time_varying_known_reals`, + `time_varying_unknown_categoricals`, `time_varying_unknown_reals`, + `variable_groups`. To provide more information on your data, use + `max_encoder_length`, `min_encoder_length`, `lags`. 
+- `log_dir` - str, default = "lightning_logs" | Folder into which to log results + for tensorboard, only used by TemporalFusionTransformerEstimator. +- `max_epochs` - int, default = 20 | Maximum number of epochs to run training, + only used by TemporalFusionTransformerEstimator. +- `batch_size` - int, default = 64 | Batch size for training model, only + used by TemporalFusionTransformerEstimator. + +#### search\_space + +```python +@property +def search_space() -> dict +``` + +Search space. + +Must be called after fit(...) +(use max_iter=0 and retrain_final=False to prevent actual fitting). + +**Returns**: + + A dict of the search space. + +#### low\_cost\_partial\_config + +```python +@property +def low_cost_partial_config() -> dict +``` + +Low cost partial config. + +**Returns**: + + A dict. + (a) if there is only one estimator in estimator_list, each key is a + hyperparameter name. + (b) otherwise, it is a nested dict with 'ml' as the key, and + a list of the low_cost_partial_configs as the value, corresponding + to each learner's low_cost_partial_config; the estimator index as + an integer corresponding to the cheapest learner is appended to the + list at the end. + +#### cat\_hp\_cost + +```python +@property +def cat_hp_cost() -> dict +``` + +Categorical hyperparameter cost + +**Returns**: + + A dict. + (a) if there is only one estimator in estimator_list, each key is a + hyperparameter name. + (b) otherwise, it is a nested dict with 'ml' as the key, and + a list of the cat_hp_cost's as the value, corresponding + to each learner's cat_hp_cost; the cost relative to lgbm for each + learner (as a list itself) is appended to the list at the end. + +#### points\_to\_evaluate + +```python +@property +def points_to_evaluate() -> dict +``` + +Initial points to evaluate. + +**Returns**: + + A list of dicts. Each dict is the initial point for each learner. 
+ +#### resource\_attr + +```python +@property +def resource_attr() -> Optional[str] +``` + +Attribute of the resource dimension. + +**Returns**: + + A string for the sample size attribute + (the resource attribute in AutoML) or None. + +#### min\_resource + +```python +@property +def min_resource() -> Optional[float] +``` + +Attribute for pruning. + +**Returns**: + + A float for the minimal sample size or None. + +#### max\_resource + +```python +@property +def max_resource() -> Optional[float] +``` + +Attribute for pruning. + +**Returns**: + + A float for the maximal sample size or None. + +#### trainable + +```python +@property +def trainable() -> Callable[[dict], Optional[float]] +``` + +Training function. + +**Returns**: + + A function that evaluates each config and returns the loss. + +#### metric\_constraints + +```python +@property +def metric_constraints() -> list +``` + +Metric constraints. + +**Returns**: + + A list of the metric constraints. + +#### fit + +```python +def fit(X_train=None, y_train=None, dataframe=None, label=None, metric=None, task=None, n_jobs=None, log_file_name=None, estimator_list=None, time_budget=None, max_iter=None, sample=None, ensemble=None, eval_method=None, log_type=None, model_history=None, split_ratio=None, n_splits=None, log_training_metric=None, mem_thres=None, pred_time_limit=None, train_time_limit=None, X_val=None, y_val=None, sample_weight_val=None, groups_val=None, groups=None, verbose=None, retrain_full=None, split_type=None, learner_selector=None, hpo_method=None, starting_points=None, seed=None, n_concurrent_trials=None, keep_search_state=None, preserve_checkpoint=True, early_stop=None, append_log=None, auto_augment=None, min_sample_size=None, use_ray=None, free_mem_ratio=0, metric_constraints=None, custom_hp=None, cv_score_agg_func=None, skip_transform=None, fit_kwargs_by_estimator=None, **fit_kwargs, ,) +``` + +Find a model for a given task. 
+ +**Arguments**: + +- `X_train` - A numpy array or a pandas dataframe of training data in + shape (n, m). For time series forecast tasks, the first column of X_train + must be the timestamp column (datetime type). Other columns in + the dataframe are assumed to be exogenous variables (categorical or numeric). + When using ray, X_train can be a ray.ObjectRef. +- `y_train` - A numpy array or a pandas series of labels in shape (n, ). +- `dataframe` - A dataframe of training data including label column. + For time series forecast tasks, dataframe must be specified and must have + at least two columns, timestamp and label, where the first + column is the timestamp column (datetime type). Other columns in + the dataframe are assumed to be exogenous variables (categorical or numeric). + When using ray, dataframe can be a ray.ObjectRef. +- `label` - A str of the label column name, e.g., 'label'; +- `Note` - If X_train and y_train are provided, + dataframe and label are ignored; + If not, dataframe and label must be provided. +- `metric` - A string of the metric name or a function, + e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_weighted', + 'roc_auc_ovo_weighted', 'roc_auc_ovr_weighted', 'f1', 'micro_f1', 'macro_f1', + 'log_loss', 'mae', 'mse', 'r2', 'mape'. Default is 'auto'. + If passing a customized metric function, the function needs to + have the following input arguments: + +```python +def custom_metric( + X_test, y_test, estimator, labels, + X_train, y_train, weight_test=None, weight_train=None, + config=None, groups_test=None, groups_train=None, +): + return metric_to_minimize, metrics_to_log +``` + which returns a float number as the minimization objective, + and a dictionary as the metrics to log. 
E.g., + +```python +def custom_metric( + X_val, y_val, estimator, labels, + X_train, y_train, weight_val=None, weight_train=None, + *args, +): + from sklearn.metrics import log_loss + import time + + start = time.time() + y_pred = estimator.predict_proba(X_val) + pred_time = (time.time() - start) / len(X_val) + val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val) + y_pred = estimator.predict_proba(X_train) + train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train) + alpha = 0.5 + return val_loss * (1 + alpha) - alpha * train_loss, { + "val_loss": val_loss, + "train_loss": train_loss, + "pred_time": pred_time, + } +``` +- `task` - A string of the task type, e.g., + 'classification', 'regression', 'ts_forecast_regression', + 'ts_forecast_classification', 'rank', 'seq-classification', + 'seq-regression', 'summarization'. +- `n_jobs` - An integer of the number of threads for training | default=-1. + Use all available resources when n_jobs == -1. +- `log_file_name` - A string of the log file name | default="". To disable logging, + set it to be an empty string "". +- `estimator_list` - A list of strings for estimator names, or 'auto'. + e.g., ```['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']```. +- `time_budget` - A float number of the time budget in seconds. + Use -1 if no time limit. +- `max_iter` - An integer of the maximal number of iterations. +- `NOTE` - when both time_budget and max_iter are unspecified, + only one model will be trained per estimator. +- `sample` - A boolean of whether to sample the training data during + search. +- `ensemble` - boolean or dict | default=False. Whether to perform + ensemble after search. Can be a dict with keys 'passthrough' + and 'final_estimator' to specify the passthrough and + final_estimator in the stacker. The dict can also contain + 'n_jobs' as the key to specify the number of jobs for the stacker. 
+- `eval_method` - A string of resampling strategy, one of + ['auto', 'cv', 'holdout']. +- `split_ratio` - A float of the validation data percentage for holdout. +- `n_splits` - An integer of the number of folds for cross-validation. +- `log_type` - A string of the log type, one of + ['better', 'all']. + 'better' only logs configs with better loss than previous iters + 'all' logs all the tried configs. +- `model_history` - A boolean of whether to keep the trained best + model per estimator. Make sure memory is large enough if setting to True. + Default value is False: best_model_for_estimator would return an + untrained model for non-best learner. +- `log_training_metric` - A boolean of whether to log the training + metric for each model. +- `mem_thres` - A float of the memory size constraint in bytes. +- `pred_time_limit` - A float of the prediction latency constraint in seconds. + It refers to the average prediction time per row in validation data. +- `train_time_limit` - None or a float of the training time constraint in seconds. +- `X_val` - None or a numpy array or a pandas dataframe of validation data. +- `y_val` - None or a numpy array or a pandas series of validation labels. +- `sample_weight_val` - None or a numpy array of the sample weight of + validation data of the same shape as y_val. +- `groups_val` - None or array-like | group labels (with matching length + to y_val) or group counts (with sum equal to length of y_val) + for validation data. Need to be consistent with groups. +- `groups` - None or array-like | Group labels (with matching length to + y_train) or groups counts (with sum equal to length of y_train) + for training data. +- `verbose` - int, default=3 | Controls the verbosity, higher means more + messages. +- `retrain_full` - bool or str, default=True | whether to retrain the + selected model on the full training data when using holdout. 
+ True - retrain only after search finishes; False - no retraining; + 'budget' - do best effort to retrain without violating the time + budget. +- `split_type` - str or splitter object, default="auto" | the data split type. + * A valid splitter object is an instance of a derived class of scikit-learn + [KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold) + and have ``split`` and ``get_n_splits`` methods with the same signatures. + Set eval_method to "cv" to use the splitter object. + * Valid str options depend on different tasks. + For classification tasks, valid choices are + ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified. + For regression tasks, valid choices are ["auto", 'uniform', 'time']. + "auto" -> uniform. + For time series forecast tasks, must be "auto" or 'time'. + For ranking task, must be "auto" or 'group'. +- `hpo_method` - str, default="auto" | The hyperparameter + optimization method. By default, CFO is used for sequential + search and BlendSearch is used for parallel search. + No need to set when using flaml's default search space or using + a simple customized search space. When set to 'bs', BlendSearch + is used. BlendSearch can be tried when the search space is + complex, for example, containing multiple disjoint, discontinuous + subspaces. When set to 'random', random search is used. +- `starting_points` - A dictionary or a str to specify the starting hyperparameter + config for the estimators | default="data". + If str: + - if "data", use data-dependent defaults; + - if "data:path" use data-dependent defaults which are stored at path; + - if "static", use data-independent defaults. + If dict, keys are the name of the estimators, and values are the starting + hyperparameter configurations for the corresponding estimators. + The value can be a single hyperparameter configuration dict or a list + of hyperparameter configuration dicts. 
+  In the following code example, we get starting_points from the
+  `automl` object and use them in the `new_automl` object.
+  e.g.,
+
+```python
+from flaml import AutoML
+automl = AutoML()
+X_train, y_train = load_iris(return_X_y=True)
+automl.fit(X_train, y_train)
+starting_points = automl.best_config_per_estimator
+
+new_automl = AutoML()
+new_automl.fit(X_train, y_train, starting_points=starting_points)
+```
+
+- `seed` - int or None, default=None | The random seed for hpo.
+- `n_concurrent_trials` - [Experimental] int, default=1 | The number of
+  concurrent trials. When n_concurrent_trials > 1, flaml performs
+  [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning)
+  and installation of ray is required: `pip install flaml[ray]`.
+- `keep_search_state` - boolean, default=False | Whether to keep data needed
+  for model search after fit(). By default the state is deleted for
+  space saving.
+- `preserve_checkpoint` - boolean, default=True | Whether to preserve the saved checkpoint
+  on disk when deleting automl. By default the checkpoint is preserved.
+- `early_stop` - boolean, default=False | Whether to stop early if the
+  search is considered to converge.
+- `append_log` - boolean, default=False | Whether to directly append the log
+  records to the input log file if it exists.
+- `auto_augment` - boolean, default=True | Whether to automatically
+  augment rare classes.
+- `min_sample_size` - int, default=MIN_SAMPLE_TRAIN | the minimal sample
+  size when sample=True.
+- `use_ray` - boolean or dict.
+  If boolean: default=False | Whether to use ray to run the training
+  in separate processes. This can be used to prevent OOM for large
+  datasets, but will incur more overhead in time.
+  If dict: the dict contains the keyword arguments to be passed to
+  [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html).
+- `free_mem_ratio` - float between 0 and 1, default=0. The free memory ratio to keep during training.
+- `metric_constraints` - list, default=[] | The list of metric constraints.
+  Each element in this list is a 3-tuple, which shall be expressed
+  in the following format: the first element of the 3-tuple is the name of the
+  metric, the second element is the inequality sign chosen from ">=" and "<=",
+  and the third element is the constraint value. E.g., `('precision', '>=', 0.9)`.
+  Note that all the metric names in metric_constraints need to be reported via
+  the metrics_to_log dictionary returned by a customized metric function.
+  The customized metric function shall be provided via the `metric` keyword argument
+  of the fit() function or the automl constructor.
+  Find examples in this [test](https://github.com/microsoft/FLAML/tree/main/test/automl/test_constraints.py).
+  If `pred_time_limit` is provided as one of keyword arguments to fit() function or
+  the automl constructor, flaml will automatically (and under the hood)
+  add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'
+  specifies a constraint on the prediction latency in seconds.
+- `custom_hp` - dict, default=None | The custom search space specified by user.
+  Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the
+  domain of the custom search space can be either a value or a sample.Domain object.
+
+
+
+```python
+custom_hp = {
+    "transformer_ms": {
+        "model_path": {
+            "domain": "albert-base-v2",
+        },
+        "learning_rate": {
+            "domain": tune.choice([1e-4, 1e-5]),
+        }
+    }
+}
+```
+
+- `cv_score_agg_func` - customized cross-validation scores aggregate function. Default to average metrics across folds. If specified, this function needs to
+  have the following input arguments:
+
+  * val_loss_folds: list of floats, the loss scores of each fold;
+  * log_metrics_folds: list of dicts/floats, the metrics of each fold to log.
+
+  This function should return the final aggregate result of all folds.
A float number of the minimization objective, and a dictionary as the metrics to log or None. + E.g., + +```python +def cv_score_agg_func(val_loss_folds, log_metrics_folds): + metric_to_minimize = sum(val_loss_folds)/len(val_loss_folds) + metrics_to_log = None + for single_fold in log_metrics_folds: + if metrics_to_log is None: + metrics_to_log = single_fold + elif isinstance(metrics_to_log, dict): + metrics_to_log = {k: metrics_to_log[k] + v for k, v in single_fold.items()} + else: + metrics_to_log += single_fold + if metrics_to_log: + n = len(val_loss_folds) + metrics_to_log = ( + {k: v / n for k, v in metrics_to_log.items()} + if isinstance(metrics_to_log, dict) + else metrics_to_log / n + ) + return metric_to_minimize, metrics_to_log +``` + +- `skip_transform` - boolean, default=False | Whether to pre-process data prior to modeling. +- `fit_kwargs_by_estimator` - dict, default=None | The user specified keywords arguments, grouped by estimator name. + For TransformersEstimator, available fit_kwargs can be found from + [TrainingArgumentsForAuto](nlp/huggingface/training_args). + e.g., + +```python +fit_kwargs_by_estimator = { + "transformer": { + "output_dir": "test/data/output/", + "fp16": False, + }, + "tft": { + "max_encoder_length": 1, + "min_encoder_length": 1, + "static_categoricals": [], + "static_reals": [], + "time_varying_known_categoricals": [], + "time_varying_known_reals": [], + "time_varying_unknown_categoricals": [], + "time_varying_unknown_reals": [], + "variable_groups": {}, + "lags": {}, + } +} +``` + +- `**fit_kwargs` - Other key word arguments to pass to fit() function of + the searched learners, such as sample_weight. Below are a few examples of + estimator-specific parameters: +- `period` - int | forecast horizon for all time series forecast tasks. +- `gpu_per_trial` - float, default = 0 | A float of the number of gpus per trial, + only used by TransformersEstimator, XGBoostSklearnEstimator, and + TemporalFusionTransformerEstimator. 
+- `group_ids` - list of strings of column names identifying a time series, only + used by TemporalFusionTransformerEstimator, required for + 'ts_forecast_panel' task. `group_ids` is a parameter for TimeSeriesDataSet object + from PyTorchForecasting. + For other parameters to describe your dataset, refer to + [TimeSeriesDataSet PyTorchForecasting](https://pytorch-forecasting.readthedocs.io/en/stable/api/pytorch_forecasting.data.timeseries.TimeSeriesDataSet.html). + To specify your variables, use `static_categoricals`, `static_reals`, + `time_varying_known_categoricals`, `time_varying_known_reals`, + `time_varying_unknown_categoricals`, `time_varying_unknown_reals`, + `variable_groups`. To provide more information on your data, use + `max_encoder_length`, `min_encoder_length`, `lags`. +- `log_dir` - str, default = "lightning_logs" | Folder into which to log results + for tensorboard, only used by TemporalFusionTransformerEstimator. +- `max_epochs` - int, default = 20 | Maximum number of epochs to run training, + only used by TemporalFusionTransformerEstimator. +- `batch_size` - int, default = 64 | Batch size for training model, only + used by TemporalFusionTransformerEstimator. + diff --git a/website/versioned_docs/version-1.0.4/reference/automl/data.md b/website/versioned_docs/version-1.0.4/reference/automl/data.md new file mode 100644 index 0000000000..56aa151dae --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/automl/data.md @@ -0,0 +1,133 @@ +--- +sidebar_label: data +title: automl.data +--- + +#### load\_openml\_dataset + +```python +def load_openml_dataset(dataset_id, data_dir=None, random_state=0, dataset_format="dataframe") +``` + +Load dataset from open ML. + +If the file is not cached locally, download it from open ML. + +**Arguments**: + +- `dataset_id` - An integer of the dataset id in openml. +- `data_dir` - A string of the path to store and load the data. +- `random_state` - An integer of the random seed for splitting data. 
+- `dataset_format` - A string specifying the format of returned dataset. Default is 'dataframe'. + Can choose from ['dataframe', 'array']. + If 'dataframe', the returned dataset will be a Pandas DataFrame. + If 'array', the returned dataset will be a NumPy array or a SciPy sparse matrix. + + +**Returns**: + +- `X_train` - Training data. +- `X_test` - Test data. +- `y_train` - A series or array of labels for training data. +- `y_test` - A series or array of labels for test data. + +#### load\_openml\_task + +```python +def load_openml_task(task_id, data_dir) +``` + +Load task from open ML. + +Use the first fold of the task. +If the file is not cached locally, download it from open ML. + +**Arguments**: + +- `task_id` - An integer of the task id in openml. +- `data_dir` - A string of the path to store and load the data. + + +**Returns**: + +- `X_train` - A dataframe of training data. +- `X_test` - A dataframe of test data. +- `y_train` - A series of labels for training data. +- `y_test` - A series of labels for test data. + +#### get\_output\_from\_log + +```python +def get_output_from_log(filename, time_budget) +``` + +Get output from log file. + +**Arguments**: + +- `filename` - A string of the log file name. +- `time_budget` - A float of the time budget in seconds. + + +**Returns**: + +- `search_time_list` - A list of the finished time of each logged iter. +- `best_error_list` - A list of the best validation error after each logged iter. +- `error_list` - A list of the validation error of each logged iter. +- `config_list` - A list of the estimator, sample size and config of each logged iter. +- `logged_metric_list` - A list of the logged metric of each logged iter. + +#### concat + +```python +def concat(X1, X2) +``` + +concatenate two matrices vertically. + +## DataTransformer Objects + +```python +class DataTransformer() +``` + +Transform input training data. 
+ +#### fit\_transform + +```python +def fit_transform(X: Union[DataFrame, np.array], y, task) +``` + +Fit transformer and process the input training data according to the task type. + +**Arguments**: + +- `X` - A numpy array or a pandas dataframe of training data. +- `y` - A numpy array or a pandas series of labels. +- `task` - A string of the task type, e.g., + 'classification', 'regression', 'ts_forecast', 'rank'. + + +**Returns**: + +- `X` - Processed numpy array or pandas dataframe of training data. +- `y` - Processed numpy array or pandas series of labels. + +#### transform + +```python +def transform(X: Union[DataFrame, np.array]) +``` + +Process data using fit transformer. + +**Arguments**: + +- `X` - A numpy array or a pandas dataframe of training data. + + +**Returns**: + +- `X` - Processed numpy array or pandas dataframe of training data. + diff --git a/website/versioned_docs/version-1.0.4/reference/automl/ml.md b/website/versioned_docs/version-1.0.4/reference/automl/ml.md new file mode 100644 index 0000000000..1a5b2813ca --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/automl/ml.md @@ -0,0 +1,74 @@ +--- +sidebar_label: ml +title: automl.ml +--- + +#### sklearn\_metric\_loss\_score + +```python +def sklearn_metric_loss_score(metric_name, y_predict, y_true, labels=None, sample_weight=None, groups=None) +``` + +Loss using the specified metric. + +**Arguments**: + +- `metric_name` - A string of the metric name, one of + 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr', + 'roc_auc_ovo', 'roc_auc_weighted', 'roc_auc_ovo_weighted', 'roc_auc_ovr_weighted', + 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'. +- `y_predict` - A 1d or 2d numpy array of the predictions which can be + used to calculate the metric. E.g., 2d for log_loss and 1d + for others. +- `y_true` - A 1d numpy array of the true labels. +- `labels` - A list or an array of the unique labels. +- `sample_weight` - A 1d numpy array of the sample weight. 
+- `groups` - A 1d numpy array of the group labels.
+
+
+**Returns**:
+
+- `score` - A float number of the loss, the lower the better.
+
+#### norm\_confusion\_matrix
+
+```python
+def norm_confusion_matrix(y_true, y_pred)
+```
+
+Normalized confusion matrix.
+
+**Arguments**:
+
+- `estimator` - A multi-class classification estimator.
+- `y_true` - A numpy array or a pandas series of true labels.
+- `y_pred` - A numpy array or a pandas series of predicted labels.
+
+
+**Returns**:
+
+  A normalized confusion matrix.
+
+#### multi\_class\_curves
+
+```python
+def multi_class_curves(y_true, y_pred_proba, curve_func)
+```
+
+Binarize the data for multi-class tasks and produce ROC or precision-recall curves.
+
+**Arguments**:

+- `y_true` - A numpy array or a pandas series of true labels.
+- `y_pred_proba` - A numpy array or a pandas dataframe of predicted probabilities.
+- `curve_func` - A function to produce a curve (e.g., roc_curve or precision_recall_curve).
+
+
+**Returns**:
+
+  A tuple of two dictionaries with the same set of keys (class indices).
+  The first dictionary curve_x stores the x coordinates of each curve, e.g.,
+  curve_x[0] is a 1D array of the x coordinates of class 0.
+  The second dictionary curve_y stores the y coordinates of each curve, e.g.,
+  curve_y[0] is a 1D array of the y coordinates of class 0.
+
diff --git a/website/versioned_docs/version-1.0.4/reference/automl/model.md b/website/versioned_docs/version-1.0.4/reference/automl/model.md
new file mode 100644
index 0000000000..aaff470ebc
--- /dev/null
+++ b/website/versioned_docs/version-1.0.4/reference/automl/model.md
@@ -0,0 +1,420 @@
+---
+sidebar_label: model
+title: automl.model
+---
+
+## BaseEstimator Objects
+
+```python
+class BaseEstimator()
+```
+
+The abstract class for all learners.
+
+Typical examples:
+* XGBoostEstimator: for regression.
+* XGBoostSklearnEstimator: for classification.
+* LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier: + for both regression and classification. + +#### \_\_init\_\_ + +```python +def __init__(task="binary", **config) +``` + +Constructor. + +**Arguments**: + +- `task` - A string of the task type, one of + 'binary', 'multiclass', 'regression', 'rank', 'seq-classification', + 'seq-regression', 'token-classification', 'multichoice-classification', + 'summarization', 'ts_forecast', 'ts_forecast_classification'. +- `config` - A dictionary containing the hyperparameter names, 'n_jobs' as keys. + n_jobs is the number of parallel threads. + +#### model + +```python +@property +def model() +``` + +Trained model after fit() is called, or None before fit() is called. + +#### estimator + +```python +@property +def estimator() +``` + +Trained model after fit() is called, or None before fit() is called. + +#### feature\_names\_in\_ + +```python +@property +def feature_names_in_() +``` + +if self._model has attribute feature_names_in_, return it. +otherwise, if self._model has attribute feature_name_, return it. +otherwise, if self._model has attribute feature_names, return it. +otherwise, if self._model has method get_booster, return the feature names. +otherwise, return None. + +#### feature\_importances\_ + +```python +@property +def feature_importances_() +``` + +if self._model has attribute feature_importances_, return it. +otherwise, if self._model has attribute coef_, return it. +otherwise, return None. + +#### fit + +```python +def fit(X_train, y_train, budget=None, free_mem_ratio=0, **kwargs) +``` + +Train the model from given training data. + +**Arguments**: + +- `X_train` - A numpy array or a dataframe of training data in shape n*m. +- `y_train` - A numpy array or a series of labels in shape n*1. +- `budget` - A float of the time budget in seconds. +- `free_mem_ratio` - A float between 0 and 1 for the free memory ratio to keep during training. 
+ + +**Returns**: + +- `train_time` - A float of the training time in seconds. + +#### predict + +```python +def predict(X, **kwargs) +``` + +Predict label from features. + +**Arguments**: + +- `X` - A numpy array or a dataframe of featurized instances, shape n*m. + + +**Returns**: + + A numpy array of shape n*1. + Each element is the label for a instance. + +#### predict\_proba + +```python +def predict_proba(X, **kwargs) +``` + +Predict the probability of each class from features. + +Only works for classification problems + +**Arguments**: + +- `X` - A numpy array of featurized instances, shape n*m. + + +**Returns**: + + A numpy array of shape n*c. c is the # classes. + Each element at (i,j) is the probability for instance i to be in + class j. + +#### score + +```python +def score(X_val: DataFrame, y_val: Series, **kwargs) +``` + +Report the evaluation score of a trained estimator. + + +**Arguments**: + +- `X_val` - A pandas dataframe of the validation input data. +- `y_val` - A pandas series of the validation label. +- `kwargs` - keyword argument of the evaluation function, for example: + - metric: A string of the metric name or a function + e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', + 'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2', + 'mape'. Default is 'auto'. + If metric is given, the score will report the user specified metric. + If metric is not given, the metric is set to accuracy for classification and r2 + for regression. + You can also pass a customized metric function, for examples on how to pass a + customized metric function, please check + [test/nlp/test_autohf_custom_metric.py](https://github.com/microsoft/FLAML/blob/main/test/nlp/test_autohf_custom_metric.py) and + [test/automl/test_multiclass.py](https://github.com/microsoft/FLAML/blob/main/test/automl/test_multiclass.py). + + +**Returns**: + + The evaluation score on the validation dataset. 
+ +#### search\_space + +```python +@classmethod +def search_space(cls, data_size, task, **params) +``` + +[required method] search space. + +**Arguments**: + +- `data_size` - A tuple of two integers, number of rows and columns. +- `task` - A str of the task type, e.g., "binary", "multiclass", "regression". + + +**Returns**: + + A dictionary of the search space. + Each key is the name of a hyperparameter, and value is a dict with + its domain (required) and low_cost_init_value, init_value, + cat_hp_cost (if applicable). + e.g., ```{'domain': tune.randint(lower=1, upper=10), 'init_value': 1}```. + +#### size + +```python +@classmethod +def size(cls, config: dict) -> float +``` + +[optional method] memory size of the estimator in bytes. + +**Arguments**: + +- `config` - A dict of the hyperparameter config. + + +**Returns**: + + A float of the memory size required by the estimator to train the + given config. + +#### cost\_relative2lgbm + +```python +@classmethod +def cost_relative2lgbm(cls) -> float +``` + +[optional method] relative cost compared to lightgbm. + +#### init + +```python +@classmethod +def init(cls) +``` + +[optional method] initialize the class. + +#### config2params + +```python +def config2params(config: dict) -> dict +``` + +[optional method] config dict to params dict + +**Arguments**: + +- `config` - A dict of the hyperparameter config. + + +**Returns**: + + A dict that will be passed to self.estimator_class's constructor. + +## TransformersEstimator Objects + +```python +class TransformersEstimator(BaseEstimator) +``` + +The class for fine-tuning language models, using huggingface transformers API. + +## SKLearnEstimator Objects + +```python +class SKLearnEstimator(BaseEstimator) +``` + +The base class for tuning scikit-learn estimators. + +Subclasses can modify the function signature of ``__init__`` to +ignore the values in ``config`` that are not relevant to the constructor +of their underlying estimator. 
For example, some regressors in ``scikit-learn`` +don't accept the ``n_jobs`` parameter contained in ``config``. For these, +one can add ``n_jobs=None,`` before ``**config`` to make sure ``config`` doesn't +contain an ``n_jobs`` key. + +## LGBMEstimator Objects + +```python +class LGBMEstimator(BaseEstimator) +``` + +The class for tuning LGBM, using sklearn API. + +## XGBoostEstimator Objects + +```python +class XGBoostEstimator(SKLearnEstimator) +``` + +The class for tuning XGBoost regressor, not using sklearn API. + +## XGBoostSklearnEstimator Objects + +```python +class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator) +``` + +The class for tuning XGBoost with unlimited depth, using sklearn API. + +## XGBoostLimitDepthEstimator Objects + +```python +class XGBoostLimitDepthEstimator(XGBoostSklearnEstimator) +``` + +The class for tuning XGBoost with limited depth, using sklearn API. + +## RandomForestEstimator Objects + +```python +class RandomForestEstimator(SKLearnEstimator, LGBMEstimator) +``` + +The class for tuning Random Forest. + +## ExtraTreesEstimator Objects + +```python +class ExtraTreesEstimator(RandomForestEstimator) +``` + +The class for tuning Extra Trees. + +## LRL1Classifier Objects + +```python +class LRL1Classifier(SKLearnEstimator) +``` + +The class for tuning Logistic Regression with L1 regularization. + +## LRL2Classifier Objects + +```python +class LRL2Classifier(SKLearnEstimator) +``` + +The class for tuning Logistic Regression with L2 regularization. + +## CatBoostEstimator Objects + +```python +class CatBoostEstimator(BaseEstimator) +``` + +The class for tuning CatBoost. + +## Prophet Objects + +```python +class Prophet(SKLearnEstimator) +``` + +The class for tuning Prophet. + +## ARIMA Objects + +```python +class ARIMA(Prophet) +``` + +The class for tuning ARIMA. + +## SARIMAX Objects + +```python +class SARIMAX(ARIMA) +``` + +The class for tuning SARIMA. 
+
+## TS\_SKLearn Objects
+
+```python
+class TS_SKLearn(SKLearnEstimator)
+```
+
+The class for tuning SKLearn Regressors for time-series forecasting, using hcrystalball.
+
+## LGBM\_TS Objects
+
+```python
+class LGBM_TS(TS_SKLearn)
+```
+
+The class for tuning LGBM Regressor for time-series forecasting.
+
+## XGBoost\_TS Objects
+
+```python
+class XGBoost_TS(TS_SKLearn)
+```
+
+The class for tuning XGBoost Regressor for time-series forecasting.
+
+## RF\_TS Objects
+
+```python
+class RF_TS(TS_SKLearn)
+```
+
+The class for tuning Random Forest Regressor for time-series forecasting.
+
+## ExtraTrees\_TS Objects
+
+```python
+class ExtraTrees_TS(TS_SKLearn)
+```
+
+The class for tuning Extra Trees Regressor for time-series forecasting.
+
+## XGBoostLimitDepth\_TS Objects
+
+```python
+class XGBoostLimitDepth_TS(TS_SKLearn)
+```
+
+The class for tuning XGBoost Regressor with limited depth for time-series forecasting.
+
+## TemporalFusionTransformerEstimator Objects
+
+```python
+class TemporalFusionTransformerEstimator(SKLearnEstimator)
+```
+
+The class for tuning Temporal Fusion Transformer.
+
diff --git a/website/versioned_docs/version-1.0.4/reference/automl/nlp/huggingface/trainer.md b/website/versioned_docs/version-1.0.4/reference/automl/nlp/huggingface/trainer.md
new file mode 100644
index 0000000000..e29a5db94f
--- /dev/null
+++ b/website/versioned_docs/version-1.0.4/reference/automl/nlp/huggingface/trainer.md
@@ -0,0 +1,19 @@
+---
+sidebar_label: trainer
+title: automl.nlp.huggingface.trainer
+---
+
+## TrainerForAuto Objects
+
+```python
+class TrainerForAuto(Seq2SeqTrainer)
+```
+
+#### evaluate
+
+```python
+def evaluate(eval_dataset=None, ignore_keys=None, metric_key_prefix="eval")
+```
+
+Overriding transformers.Trainer.evaluate by saving metrics and checkpoint path.
+ diff --git a/website/versioned_docs/version-1.0.4/reference/automl/nlp/huggingface/training_args.md b/website/versioned_docs/version-1.0.4/reference/automl/nlp/huggingface/training_args.md new file mode 100644 index 0000000000..f8c30ba984 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/automl/nlp/huggingface/training_args.md @@ -0,0 +1,34 @@ +--- +sidebar_label: training_args +title: automl.nlp.huggingface.training_args +--- + +## TrainingArgumentsForAuto Objects + +```python +@dataclass +class TrainingArgumentsForAuto(TrainingArguments) +``` + +FLAML custom TrainingArguments. + +**Arguments**: + +- `task` _str_ - the task name for NLP tasks, e.g., seq-classification, token-classification +- `output_dir` _str_ - data root directory for outputing the log, etc. +- `model_path` _str, optional, defaults to "facebook/muppet-roberta-base"_ - A string, + the path of the language model file, either a path from huggingface + model card huggingface.co/models, or a local path for the model. +- `fp16` _bool, optional, defaults to "False"_ - A bool, whether to use FP16. +- `max_seq_length` _int, optional, defaults to 128_ - An integer, the max length of the sequence. + For token classification task, this argument will be ineffective. + pad_to_max_length (bool, optional, defaults to "False"): + whether to pad all samples to model maximum sentence length. + If False, will pad the samples dynamically when batching to the maximum length in the batch. +- `per_device_eval_batch_size` _int, optional, defaults to 1_ - An integer, the per gpu evaluation batch size. +- `label_list` _List[str], optional, defaults to None_ - A list of string, the string list of the label names. + When the task is sequence labeling/token classification, there are two formats of the labels: + (1) The token labels, i.e., [B-PER, I-PER, B-LOC]; (2) Id labels. 
For (2), need to pass the label_list (e.g., [B-PER, I-PER, B-LOC]) + to convert the Id to token labels when computing the metric with metric_loss_score. + See the example in [a simple token classification example](../../../../Examples/AutoML-NLP#a-simple-token-classification-example). + diff --git a/website/versioned_docs/version-1.0.4/reference/automl/nlp/huggingface/utils.md b/website/versioned_docs/version-1.0.4/reference/automl/nlp/huggingface/utils.md new file mode 100644 index 0000000000..ca25deb356 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/automl/nlp/huggingface/utils.md @@ -0,0 +1,13 @@ +--- +sidebar_label: utils +title: automl.nlp.huggingface.utils +--- + +#### todf + +```python +def todf(X, Y, column_name) +``` + +todf converts Y from any format (list, pandas.Series, numpy array) to a DataFrame before being returned + diff --git a/website/versioned_docs/version-1.0.4/reference/automl/nlp/utils.md b/website/versioned_docs/version-1.0.4/reference/automl/nlp/utils.md new file mode 100644 index 0000000000..74b42f6bdd --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/automl/nlp/utils.md @@ -0,0 +1,13 @@ +--- +sidebar_label: utils +title: automl.nlp.utils +--- + +#### format\_vars + +```python +def format_vars(resolved_vars: Dict) -> str +``` + +Formats the resolved variable dict into a single string. + diff --git a/website/versioned_docs/version-1.0.4/reference/default/estimator.md b/website/versioned_docs/version-1.0.4/reference/default/estimator.md new file mode 100644 index 0000000000..1409eb35a0 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/default/estimator.md @@ -0,0 +1,33 @@ +--- +sidebar_label: estimator +title: default.estimator +--- + +#### flamlize\_estimator + +```python +def flamlize_estimator(super_class, name: str, task: str, alternatives=None) +``` + +Enhance an estimator class with flaml's data-dependent default hyperparameter settings. 
+
+**Example**:
+
+
+```python
+import sklearn.ensemble as ensemble
+RandomForestRegressor = flamlize_estimator(
+    ensemble.RandomForestRegressor, "rf", "regression"
+)
+```
+
+
+**Arguments**:
+
+- `super_class` - a scikit-learn compatible estimator class.
+- `name` - a str of the estimator's name.
+- `task` - a str of the task type.
+- `alternatives` - (Optional) a list for alternative estimator names. For example,
+  ```[("max_depth", 0, "xgboost")]``` means if the "max_depth" is set to 0
+  in the constructor, then look for the learned defaults for estimator "xgboost".
+
diff --git a/website/versioned_docs/version-1.0.4/reference/default/greedy.md b/website/versioned_docs/version-1.0.4/reference/default/greedy.md
new file mode 100644
index 0000000000..132fa574b0
--- /dev/null
+++ b/website/versioned_docs/version-1.0.4/reference/default/greedy.md
@@ -0,0 +1,29 @@
+---
+sidebar_label: greedy
+title: default.greedy
+---
+
+#### construct\_portfolio
+
+```python
+def construct_portfolio(regret_matrix, meta_features, regret_bound)
+```
+
+The portfolio construction algorithm.
+
+[Reference](https://arxiv.org/abs/2202.09927).
+
+**Arguments**:
+
+- `regret_matrix` - A dataframe of regret matrix.
+- `meta_features` - None or a dataframe of metafeatures matrix.
+  When set to None, the algorithm uses greedy strategy.
+  Otherwise, the algorithm uses greedy strategy with feedback
+  from the nearest neighbor predictor.
+- `regret_bound` - A float of the regret bound.
+
+
+**Returns**:
+
+  A list of configuration names.
+
diff --git a/website/versioned_docs/version-1.0.4/reference/default/portfolio.md b/website/versioned_docs/version-1.0.4/reference/default/portfolio.md
new file mode 100644
index 0000000000..610610b6d9
--- /dev/null
+++ b/website/versioned_docs/version-1.0.4/reference/default/portfolio.md
@@ -0,0 +1,57 @@
+---
+sidebar_label: portfolio
+title: default.portfolio
+---
+
+#### config\_predictor\_tuple
+
+```python
+def config_predictor_tuple(tasks, configs, meta_features, regret_matrix)
+```
+
+Config predictor represented in tuple.
+
+The returned tuple consists of (meta_features, preferences, proc).
+
+**Returns**:
+
+- `meta_features_norm` - A dataframe of normalized meta features, each column for a task.
+- `preferences` - A dataframe of sorted configuration indices by their performance per task (column).
+- `regret_matrix` - A dataframe of the configuration(row)-task(column) regret matrix.
+
+#### build\_portfolio
+
+```python
+def build_portfolio(meta_features, regret, strategy)
+```
+
+Build a portfolio from meta features and regret matrix.
+
+**Arguments**:
+
+- `meta_features` - A dataframe of metafeatures matrix.
+- `regret` - A dataframe of regret matrix.
+- `strategy` - A str of the strategy, one of ("greedy", "greedy-feedback").
+
+#### load\_json
+
+```python
+def load_json(filename)
+```
+
+Returns the contents of json file filename.
+
+#### serialize
+
+```python
+def serialize(configs, regret, meta_features, output_file, config_path)
+```
+
+Store to disk all information FLAML-metalearn needs at runtime.
+ +configs: names of model configs +regret: regret matrix +meta_features: task metafeatures +output_file: filename +config_path: path containing config json files + diff --git a/website/versioned_docs/version-1.0.4/reference/default/suggest.md b/website/versioned_docs/version-1.0.4/reference/default/suggest.md new file mode 100644 index 0000000000..445faac5b4 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/default/suggest.md @@ -0,0 +1,134 @@ +--- +sidebar_label: suggest +title: default.suggest +--- + +#### suggest\_config + +```python +def suggest_config(task, X, y, estimator_or_predictor, location=None, k=None) +``` + +Suggest a list of configs for the given task and training data. + +The returned configs can be used as starting points for AutoML.fit(). +`FLAML_sample_size` is removed from the configs. + +#### suggest\_learner + +```python +def suggest_learner(task, X, y, estimator_or_predictor="all", estimator_list=None, location=None) +``` + +Suggest best learner within estimator_list. + +#### suggest\_hyperparams + +```python +def suggest_hyperparams(task, X, y, estimator_or_predictor, location=None) +``` + +Suggest hyperparameter configurations and an estimator class. + +The configurations can be used to initialize the estimator class like lightgbm.LGBMRegressor. + +**Example**: + + +```python +hyperparams, estimator_class = suggest_hyperparams("regression", X_train, y_train, "lgbm") +model = estimator_class(**hyperparams) # estimator_class is LGBMRegressor +model.fit(X_train, y_train) +``` + + +**Arguments**: + +- `task` - A string of the task type, e.g., + 'classification', 'regression', 'ts_forecast', 'rank', + 'seq-classification', 'seq-regression'. +- `X` - A dataframe of training data in shape n*m. + For 'ts_forecast' task, the first column of X_train + must be the timestamp column (datetime type). Other + columns in the dataframe are assumed to be exogenous + variables (categorical or numeric). 
+- `y` - A series of labels in shape n*1. +- `estimator_or_predictor` - A str of the learner name or a dict of the learned config predictor. + If a dict, it contains: + - "version": a str of the version number. + - "preprocessing": a dictionary containing: + * "center": a list of meta feature value offsets for normalization. + * "scale": a list of meta feature scales to normalize each dimension. + - "neighbors": a list of dictionaries. Each dictionary contains: + * "features": a list of the normalized meta features for a neighbor. + * "choice": an integer of the configuration id in the portfolio. + - "portfolio": a list of dictionaries, each corresponding to a configuration: + * "class": a str of the learner name. + * "hyperparameters": a dict of the config. The key "FLAML_sample_size" will be ignored. +- `location` - (Optional) A str of the location containing mined portfolio file. + Only valid when the portfolio is a str, by default the location is flaml/default. + + +**Returns**: + +- `hyperparams` - A dict of the hyperparameter configurations. +- `estiamtor_class` - A class of the underlying estimator, e.g., lightgbm.LGBMClassifier. + +#### preprocess\_and\_suggest\_hyperparams + +```python +def preprocess_and_suggest_hyperparams(task, X, y, estimator_or_predictor, location=None) +``` + +Preprocess the data and suggest hyperparameters. + +**Example**: + + +```python +hyperparams, estimator_class, X, y, feature_transformer, label_transformer = \ + preprocess_and_suggest_hyperparams("classification", X_train, y_train, "xgb_limitdepth") +model = estimator_class(**hyperparams) # estimator_class is XGBClassifier +model.fit(X, y) +X_test = feature_transformer.transform(X_test) +y_pred = label_transformer.inverse_transform(pd.Series(model.predict(X_test).astype(int))) +``` + + +**Arguments**: + +- `task` - A string of the task type, e.g., + 'classification', 'regression', 'ts_forecast', 'rank', + 'seq-classification', 'seq-regression'. 
+- `X` - A dataframe of training data in shape n*m. + For 'ts_forecast' task, the first column of X_train + must be the timestamp column (datetime type). Other + columns in the dataframe are assumed to be exogenous + variables (categorical or numeric). +- `y` - A series of labels in shape n*1. +- `estimator_or_predictor` - A str of the learner name or a dict of the learned config predictor. + "choose_xgb" means choosing between xgb_limitdepth and xgboost. + If a dict, it contains: + - "version": a str of the version number. + - "preprocessing": a dictionary containing: + * "center": a list of meta feature value offsets for normalization. + * "scale": a list of meta feature scales to normalize each dimension. + - "neighbors": a list of dictionaries. Each dictionary contains: + * "features": a list of the normalized meta features for a neighbor. + * "choice": a integer of the configuration id in the portfolio. + - "portfolio": a list of dictionaries, each corresponding to a configuration: + * "class": a str of the learner name. + * "hyperparameters": a dict of the config. They key "FLAML_sample_size" will be ignored. +- `location` - (Optional) A str of the location containing mined portfolio file. + Only valid when the portfolio is a str, by default the location is flaml/default. + + +**Returns**: + +- `hyperparams` - A dict of the hyperparameter configurations. +- `estiamtor_class` - A class of the underlying estimator, e.g., lightgbm.LGBMClassifier. +- `X` - the preprocessed X. +- `y` - the preprocessed y. +- `feature_transformer` - a data transformer that can be applied to X_test. +- `label_transformer` - a label transformer that can be applied to y_test. 
+ diff --git a/website/versioned_docs/version-1.0.4/reference/onlineml/autovw.md b/website/versioned_docs/version-1.0.4/reference/onlineml/autovw.md new file mode 100644 index 0000000000..d7c9e2dc52 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/onlineml/autovw.md @@ -0,0 +1,110 @@ +--- +sidebar_label: autovw +title: onlineml.autovw +--- + +## AutoVW Objects + +```python +class AutoVW() +``` + +Class for the AutoVW algorithm. + +#### \_\_init\_\_ + +```python +def __init__(max_live_model_num: int, search_space: dict, init_config: Optional[dict] = {}, min_resource_lease: Optional[Union[str, float]] = "auto", automl_runner_args: Optional[dict] = {}, scheduler_args: Optional[dict] = {}, model_select_policy: Optional[str] = "threshold_loss_ucb", metric: Optional[str] = "mae_clipped", random_seed: Optional[int] = None, model_selection_mode: Optional[str] = "min", cb_coef: Optional[float] = None) +``` + +Constructor. + +**Arguments**: + +- `max_live_model_num` - An int to specify the maximum number of + 'live' models, which, in other words, is the maximum number + of models allowed to update in each learning iteraction. +- `search_space` - A dictionary of the search space. This search space + includes both hyperparameters we want to tune and fixed + hyperparameters. In the latter case, the value is a fixed value. +- `init_config` - A dictionary of a partial or full initial config, + e.g. {'interactions': set(), 'learning_rate': 0.5} +- `min_resource_lease` - string or float | The minimum resource lease + assigned to a particular model/trial. If set as 'auto', it will + be calculated automatically. +- `automl_runner_args` - A dictionary of configuration for the OnlineTrialRunner. + If set {}, default values will be used, which is equivalent to using + the following configs. 
+ +**Example**: + + +```python +automl_runner_args = { + "champion_test_policy": 'loss_ucb', # the statistic test for a better champion + "remove_worse": False, # whether to do worse than test +} +``` + +- `scheduler_args` - A dictionary of configuration for the scheduler. + If set {}, default values will be used, which is equivalent to using the + following config. + +**Example**: + + +```python +scheduler_args = { + "keep_challenger_metric": 'ucb', # what metric to use when deciding the top performing challengers + "keep_challenger_ratio": 0.5, # denotes the ratio of top performing challengers to keep live + "keep_champion": True, # specifcies whether to keep the champion always running +} +``` + +- `model_select_policy` - A string in ['threshold_loss_ucb', + 'threshold_loss_lcb', 'threshold_loss_avg', 'loss_ucb', 'loss_lcb', + 'loss_avg'] to specify how to select one model to do prediction from + the live model pool. Default value is 'threshold_loss_ucb'. +- `metric` - A string in ['mae_clipped', 'mae', 'mse', 'absolute_clipped', + 'absolute', 'squared'] to specify the name of the loss function used + for calculating the progressive validation loss in ChaCha. +- `random_seed` - An integer of the random seed used in the searcher + (more specifically this the random seed for ConfigOracle). +- `model_selection_mode` - A string in ['min', 'max'] to specify the objective as + minimization or maximization. +- `cb_coef` - A float coefficient (optional) used in the sample complexity bound. + +#### predict + +```python +def predict(data_sample) +``` + +Predict on the input data sample. + +**Arguments**: + +- `data_sample` - one data example in vw format. + +#### learn + +```python +def learn(data_sample) +``` + +Perform one online learning step with the given data sample. + +**Arguments**: + +- `data_sample` - one data example in vw format. It will be used to + update the vw model. 
+ +#### get\_ns\_feature\_dim\_from\_vw\_example + +```python +@staticmethod +def get_ns_feature_dim_from_vw_example(vw_example) -> dict +``` + +Get a dictionary of feature dimensionality for each namespace singleton. + diff --git a/website/versioned_docs/version-1.0.4/reference/onlineml/trial.md b/website/versioned_docs/version-1.0.4/reference/onlineml/trial.md new file mode 100644 index 0000000000..d8443686d0 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/onlineml/trial.md @@ -0,0 +1,133 @@ +--- +sidebar_label: trial +title: onlineml.trial +--- + +#### get\_ns\_feature\_dim\_from\_vw\_example + +```python +def get_ns_feature_dim_from_vw_example(vw_example) -> dict +``` + +Get a dictionary of feature dimensionality for each namespace singleton. + +## OnlineResult Objects + +```python +class OnlineResult() +``` + +Class for managing the result statistics of a trial. + +#### \_\_init\_\_ + +```python +def __init__(result_type_name: str, cb_coef: Optional[float] = None, init_loss: Optional[float] = 0.0, init_cb: Optional[float] = 100.0, mode: Optional[str] = "min", sliding_window_size: Optional[int] = 100) +``` + +Constructor. + +**Arguments**: + +- `result_type_name` - A String to specify the name of the result type. +- `cb_coef` - a string to specify the coefficient on the confidence bound. +- `init_loss` - a float to specify the inital loss. +- `init_cb` - a float to specify the intial confidence bound. +- `mode` - A string in ['min', 'max'] to specify the objective as + minimization or maximization. +- `sliding_window_size` - An int to specify the size of the sliding windown + (for experimental purpose). + +#### update\_result + +```python +def update_result(new_loss, new_resource_used, data_dimension, bound_of_range=1.0, new_observation_count=1.0) +``` + +Update result statistics. + +## BaseOnlineTrial Objects + +```python +class BaseOnlineTrial(Trial) +``` + +Class for the online trial. 
+ +#### \_\_init\_\_ + +```python +def __init__(config: dict, min_resource_lease: float, is_champion: Optional[bool] = False, is_checked_under_current_champion: Optional[bool] = True, custom_trial_name: Optional[str] = "mae", trial_id: Optional[str] = None) +``` + +Constructor. + +**Arguments**: + +- `config` - The configuration dictionary. +- `min_resource_lease` - A float specifying the minimum resource lease. +- `is_champion` - A bool variable indicating whether the trial is champion. +- `is_checked_under_current_champion` - A bool indicating whether the trial + has been used under the current champion. +- `custom_trial_name` - A string of a custom trial name. +- `trial_id` - A string for the trial id. + +#### set\_resource\_lease + +```python +def set_resource_lease(resource: float) +``` + +Sets the resource lease accordingly. + +#### set\_status + +```python +def set_status(status) +``` + +Sets the status of the trial and record the start time. + +## VowpalWabbitTrial Objects + +```python +class VowpalWabbitTrial(BaseOnlineTrial) +``` + +The class for Vowpal Wabbit online trials. + +#### \_\_init\_\_ + +```python +def __init__(config: dict, min_resource_lease: float, metric: str = "mae", is_champion: Optional[bool] = False, is_checked_under_current_champion: Optional[bool] = True, custom_trial_name: Optional[str] = "vw_mae_clipped", trial_id: Optional[str] = None, cb_coef: Optional[float] = None) +``` + +Constructor. + +**Arguments**: + +- `config` _dict_ - the config of the trial (note that the config is a set + because the hyperparameters are). +- `min_resource_lease` _float_ - the minimum resource lease. +- `metric` _str_ - the loss metric. +- `is_champion` _bool_ - indicates whether the trial is the current champion or not. +- `is_checked_under_current_champion` _bool_ - indicates whether this trials has + been paused under the current champion. +- `trial_id` _str_ - id of the trial (if None, it will be generated in the constructor). 
+ +#### train\_eval\_model\_online + +```python +def train_eval_model_online(data_sample, y_pred) +``` + +Train and evaluate model online. + +#### predict + +```python +def predict(x) +``` + +Predict using the model. + diff --git a/website/versioned_docs/version-1.0.4/reference/onlineml/trial_runner.md b/website/versioned_docs/version-1.0.4/reference/onlineml/trial_runner.md new file mode 100644 index 0000000000..b3b7e0c0b3 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/onlineml/trial_runner.md @@ -0,0 +1,119 @@ +--- +sidebar_label: trial_runner +title: onlineml.trial_runner +--- + +## OnlineTrialRunner Objects + +```python +class OnlineTrialRunner() +``` + +Class for the OnlineTrialRunner. + +#### \_\_init\_\_ + +```python +def __init__(max_live_model_num: int, searcher=None, scheduler=None, champion_test_policy="loss_ucb", **kwargs) +``` + +Constructor. + +**Arguments**: + +- `max_live_model_num` - The maximum number of 'live'/running models allowed. +- `searcher` - A class for generating Trial objects progressively. + The ConfigOracle is implemented in the searcher. +- `scheduler` - A class for managing the 'live' trials and allocating the + resources for the trials. +- `champion_test_policy` - A string to specify what test policy to test for + champion. Currently can choose from ['loss_ucb', 'loss_avg', 'loss_lcb', None]. + +#### champion\_trial + +```python +@property +def champion_trial() -> Trial +``` + +The champion trial. + +#### running\_trials + +```python +@property +def running_trials() +``` + +The running/'live' trials. + +#### step + +```python +def step(data_sample=None, prediction_trial_tuple=None) +``` + +Schedule one trial to run each time it is called. + +**Arguments**: + +- `data_sample` - One data example. +- `prediction_trial_tuple` - A list of information containing + (prediction_made, prediction_trial). 
+ +#### get\_top\_running\_trials + +```python +def get_top_running_trials(top_ratio=None, top_metric="ucb") -> list +``` + +Get a list of trial ids, whose performance is among the top running trials. + +#### get\_trials + +```python +def get_trials() -> list +``` + +Return the list of trials managed by this TrialRunner. + +#### add\_trial + +```python +def add_trial(new_trial) +``` + +Add a new trial to this TrialRunner. +Trials may be added at any time. + +**Arguments**: + +- `new_trial` _Trial_ - Trial to queue. + +#### stop\_trial + +```python +def stop_trial(trial) +``` + +Stop a trial: set the status of a trial to be +Trial.TERMINATED and perform other subsequent operations. + +#### pause\_trial + +```python +def pause_trial(trial) +``` + +Pause a trial: set the status of a trial to be Trial.PAUSED +and perform other subsequent operations. + +#### run\_trial + +```python +def run_trial(trial) +``` + +Run a trial: set the status of a trial to be Trial.RUNNING +and perform other subsequent operations. 
+ diff --git a/website/versioned_docs/version-1.0.4/reference/sidebar.json b/website/versioned_docs/version-1.0.4/reference/sidebar.json new file mode 100644 index 0000000000..d0cbe17c41 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/sidebar.json @@ -0,0 +1,85 @@ +{ + "items": [ + { + "items": [ + { + "items": [ + { + "items": [ + "reference/automl/nlp/huggingface/trainer", + "reference/automl/nlp/huggingface/training_args", + "reference/automl/nlp/huggingface/utils" + ], + "label": "automl.nlp.huggingface", + "type": "category" + }, + "reference/automl/nlp/utils" + ], + "label": "automl.nlp", + "type": "category" + }, + "reference/automl/automl", + "reference/automl/data", + "reference/automl/ml", + "reference/automl/model" + ], + "label": "automl", + "type": "category" + }, + { + "items": [ + "reference/default/estimator", + "reference/default/greedy", + "reference/default/portfolio", + "reference/default/suggest" + ], + "label": "default", + "type": "category" + }, + { + "items": [ + "reference/onlineml/autovw", + "reference/onlineml/trial", + "reference/onlineml/trial_runner" + ], + "label": "onlineml", + "type": "category" + }, + { + "items": [ + { + "items": [ + "reference/tune/scheduler/online_scheduler", + "reference/tune/scheduler/trial_scheduler" + ], + "label": "tune.scheduler", + "type": "category" + }, + { + "items": [ + "reference/tune/searcher/blendsearch", + "reference/tune/searcher/cfo_cat", + "reference/tune/searcher/flow2", + "reference/tune/searcher/online_searcher", + "reference/tune/searcher/search_thread", + "reference/tune/searcher/suggestion", + "reference/tune/searcher/variant_generator" + ], + "label": "tune.searcher", + "type": "category" + }, + "reference/tune/analysis", + "reference/tune/sample", + "reference/tune/space", + "reference/tune/trial", + "reference/tune/trial_runner", + "reference/tune/tune", + "reference/tune/utils" + ], + "label": "tune", + "type": "category" + } + ], + "label": "Reference", + "type": 
"category" +} \ No newline at end of file diff --git a/website/versioned_docs/version-1.0.4/reference/tune/analysis.md b/website/versioned_docs/version-1.0.4/reference/tune/analysis.md new file mode 100644 index 0000000000..3120d9bbb9 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/tune/analysis.md @@ -0,0 +1,122 @@ +--- +sidebar_label: analysis +title: tune.analysis +--- + +## ExperimentAnalysis Objects + +```python +class ExperimentAnalysis() +``` + +Analyze results from a Tune experiment. + +#### best\_trial + +```python +@property +def best_trial() -> Trial +``` + +Get the best trial of the experiment +The best trial is determined by comparing the last trial results +using the `metric` and `mode` parameters passed to `tune.run()`. +If you didn't pass these parameters, use +`get_best_trial(metric, mode, scope)` instead. + +#### best\_config + +```python +@property +def best_config() -> Dict +``` + +Get the config of the best trial of the experiment +The best trial is determined by comparing the last trial results +using the `metric` and `mode` parameters passed to `tune.run()`. +If you didn't pass these parameters, use +`get_best_config(metric, mode, scope)` instead. + +#### results + +```python +@property +def results() -> Dict[str, Dict] +``` + +Get the last result of all the trials of the experiment + +#### get\_best\_trial + +```python +def get_best_trial(metric: Optional[str] = None, mode: Optional[str] = None, scope: str = "last", filter_nan_and_inf: bool = True) -> Optional[Trial] +``` + +Retrieve the best trial object. +Compares all trials' scores on ``metric``. +If ``metric`` is not specified, ``self.default_metric`` will be used. +If `mode` is not specified, ``self.default_mode`` will be used. +These values are usually initialized by passing the ``metric`` and +``mode`` parameters to ``tune.run()``. + +**Arguments**: + +- `metric` _str_ - Key for trial info to order on. Defaults to + ``self.default_metric``. 
+- `mode` _str_ - One of [min, max]. Defaults to ``self.default_mode``. +- `scope` _str_ - One of [all, last, avg, last-5-avg, last-10-avg]. + If `scope=last`, only look at each trial's final step for + `metric`, and compare across trials based on `mode=[min,max]`. + If `scope=avg`, consider the simple average over all steps + for `metric` and compare across trials based on + `mode=[min,max]`. If `scope=last-5-avg` or `scope=last-10-avg`, + consider the simple average over the last 5 or 10 steps for + `metric` and compare across trials based on `mode=[min,max]`. + If `scope=all`, find each trial's min/max score for `metric` + based on `mode`, and compare trials based on `mode=[min,max]`. +- `filter_nan_and_inf` _bool_ - If True (default), NaN or infinite + values are disregarded and these trials are never selected as + the best trial. + +#### get\_best\_config + +```python +def get_best_config(metric: Optional[str] = None, mode: Optional[str] = None, scope: str = "last") -> Optional[Dict] +``` + +Retrieve the best config corresponding to the trial. +Compares all trials' scores on `metric`. +If ``metric`` is not specified, ``self.default_metric`` will be used. +If `mode` is not specified, ``self.default_mode`` will be used. +These values are usually initialized by passing the ``metric`` and +``mode`` parameters to ``tune.run()``. + +**Arguments**: + +- `metric` _str_ - Key for trial info to order on. Defaults to + ``self.default_metric``. +- `mode` _str_ - One of [min, max]. Defaults to ``self.default_mode``. +- `scope` _str_ - One of [all, last, avg, last-5-avg, last-10-avg]. + If `scope=last`, only look at each trial's final step for + `metric`, and compare across trials based on `mode=[min,max]`. + If `scope=avg`, consider the simple average over all steps + for `metric` and compare across trials based on + `mode=[min,max]`. 
If `scope=last-5-avg` or `scope=last-10-avg`, + consider the simple average over the last 5 or 10 steps for + `metric` and compare across trials based on `mode=[min,max]`. + If `scope=all`, find each trial's min/max score for `metric` + based on `mode`, and compare trials based on `mode=[min,max]`. + +#### best\_result + +```python +@property +def best_result() -> Dict +``` + +Get the last result of the best trial of the experiment +The best trial is determined by comparing the last trial results +using the `metric` and `mode` parameters passed to `tune.run()`. +If you didn't pass these parameters, use +`get_best_trial(metric, mode, scope).last_result` instead. + diff --git a/website/versioned_docs/version-1.0.4/reference/tune/sample.md b/website/versioned_docs/version-1.0.4/reference/tune/sample.md new file mode 100644 index 0000000000..460cc91b24 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/tune/sample.md @@ -0,0 +1,183 @@ +--- +sidebar_label: sample +title: tune.sample +--- + +## Domain Objects + +```python +class Domain() +``` + +Base class to specify a type and valid range to sample parameters from. +This base class is implemented by parameter spaces, like float ranges +(``Float``), integer ranges (``Integer``), or categorical variables +(``Categorical``). The ``Domain`` object contains information about +valid values (e.g. minimum and maximum values), and exposes methods that +allow specification of specific samplers (e.g. ``uniform()`` or +``loguniform()``). + +#### cast + +```python +def cast(value) +``` + +Cast value to domain type + +#### is\_valid + +```python +def is_valid(value: Any) +``` + +Returns True if `value` is a valid value in this domain. + +## Grid Objects + +```python +class Grid(Sampler) +``` + +Dummy sampler used for grid search + +#### uniform + +```python +def uniform(lower: float, upper: float) +``` + +Sample a float value uniformly between ``lower`` and ``upper``. 
+Sampling from ``tune.uniform(1, 10)`` is equivalent to sampling from +``np.random.uniform(1, 10))`` + +#### quniform + +```python +def quniform(lower: float, upper: float, q: float) +``` + +Sample a quantized float value uniformly between ``lower`` and ``upper``. +Sampling from ``tune.uniform(1, 10)`` is equivalent to sampling from +``np.random.uniform(1, 10))`` +The value will be quantized, i.e. rounded to an integer increment of ``q``. +Quantization makes the upper bound inclusive. + +#### loguniform + +```python +def loguniform(lower: float, upper: float, base: float = 10) +``` + +Sugar for sampling in different orders of magnitude. + +**Arguments**: + +- `lower` _float_ - Lower boundary of the output interval (e.g. 1e-4) +- `upper` _float_ - Upper boundary of the output interval (e.g. 1e-2) +- `base` _int_ - Base of the log. Defaults to 10. + +#### qloguniform + +```python +def qloguniform(lower: float, upper: float, q: float, base: float = 10) +``` + +Sugar for sampling in different orders of magnitude. +The value will be quantized, i.e. rounded to an integer increment of ``q``. +Quantization makes the upper bound inclusive. + +**Arguments**: + +- `lower` _float_ - Lower boundary of the output interval (e.g. 1e-4) +- `upper` _float_ - Upper boundary of the output interval (e.g. 1e-2) +- `q` _float_ - Quantization number. The result will be rounded to an + integer increment of this value. +- `base` _int_ - Base of the log. Defaults to 10. + +#### choice + +```python +def choice(categories: Sequence) +``` + +Sample a categorical value. +Sampling from ``tune.choice([1, 2])`` is equivalent to sampling from +``np.random.choice([1, 2])`` + +#### randint + +```python +def randint(lower: int, upper: int) +``` + +Sample an integer value uniformly between ``lower`` and ``upper``. +``lower`` is inclusive, ``upper`` is exclusive. 
+Sampling from ``tune.randint(10)`` is equivalent to sampling from +``np.random.randint(10)`` + +#### lograndint + +```python +def lograndint(lower: int, upper: int, base: float = 10) +``` + +Sample an integer value log-uniformly between ``lower`` and ``upper``, +with ``base`` being the base of logarithm. +``lower`` is inclusive, ``upper`` is exclusive. + +#### qrandint + +```python +def qrandint(lower: int, upper: int, q: int = 1) +``` + +Sample an integer value uniformly between ``lower`` and ``upper``. + +``lower`` is inclusive, ``upper`` is also inclusive (!). + +The value will be quantized, i.e. rounded to an integer increment of ``q``. +Quantization makes the upper bound inclusive. + +#### qlograndint + +```python +def qlograndint(lower: int, upper: int, q: int, base: float = 10) +``` + +Sample an integer value log-uniformly between ``lower`` and ``upper``, +with ``base`` being the base of logarithm. +``lower`` is inclusive, ``upper`` is also inclusive (!). +The value will be quantized, i.e. rounded to an integer increment of ``q``. +Quantization makes the upper bound inclusive. + +#### randn + +```python +def randn(mean: float = 0.0, sd: float = 1.0) +``` + +Sample a float value normally with ``mean`` and ``sd``. + +**Arguments**: + +- `mean` _float_ - Mean of the normal distribution. Defaults to 0. +- `sd` _float_ - SD of the normal distribution. Defaults to 1. + +#### qrandn + +```python +def qrandn(mean: float, sd: float, q: float) +``` + +Sample a float value normally with ``mean`` and ``sd``. + +The value will be quantized, i.e. rounded to an integer increment of ``q``. + +**Arguments**: + +- `mean` - Mean of the normal distribution. +- `sd` - SD of the normal distribution. +- `q` - Quantization number. The result will be rounded to an + integer increment of this value. 
+ diff --git a/website/versioned_docs/version-1.0.4/reference/tune/scheduler/online_scheduler.md b/website/versioned_docs/version-1.0.4/reference/tune/scheduler/online_scheduler.md new file mode 100644 index 0000000000..05eba1189d --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/tune/scheduler/online_scheduler.md @@ -0,0 +1,87 @@ +--- +sidebar_label: online_scheduler +title: tune.scheduler.online_scheduler +--- + +## OnlineScheduler Objects + +```python +class OnlineScheduler(TrialScheduler) +``` + +Class for the most basic OnlineScheduler. + +#### on\_trial\_result + +```python +def on_trial_result(trial_runner, trial: Trial, result: Dict) +``` + +Report result and return a decision on the trial's status. + +#### choose\_trial\_to\_run + +```python +def choose_trial_to_run(trial_runner) -> Trial +``` + +Decide which trial to run next. + +## OnlineSuccessiveDoublingScheduler Objects + +```python +class OnlineSuccessiveDoublingScheduler(OnlineScheduler) +``` + +class for the OnlineSuccessiveDoublingScheduler algorithm. + +#### \_\_init\_\_ + +```python +def __init__(increase_factor: float = 2.0) +``` + +Constructor. + +**Arguments**: + +- `increase_factor` - A float of multiplicative factor + used to increase resource lease. Default is 2.0. + +#### on\_trial\_result + +```python +def on_trial_result(trial_runner, trial: Trial, result: Dict) +``` + +Report result and return a decision on the trial's status. + +## ChaChaScheduler Objects + +```python +class ChaChaScheduler(OnlineSuccessiveDoublingScheduler) +``` + +class for the ChaChaScheduler algorithm. + +#### \_\_init\_\_ + +```python +def __init__(increase_factor: float = 2.0, **kwargs) +``` + +Constructor. + +**Arguments**: + +- `increase_factor` - A float of multiplicative factor + used to increase resource lease. Default is 2.0. 
+ +#### on\_trial\_result + +```python +def on_trial_result(trial_runner, trial: Trial, result: Dict) +``` + +Report result and return a decision on the trial's status. + diff --git a/website/versioned_docs/version-1.0.4/reference/tune/scheduler/trial_scheduler.md b/website/versioned_docs/version-1.0.4/reference/tune/scheduler/trial_scheduler.md new file mode 100644 index 0000000000..79ffe51e35 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/tune/scheduler/trial_scheduler.md @@ -0,0 +1,13 @@ +--- +sidebar_label: trial_scheduler +title: tune.scheduler.trial_scheduler +--- + +## TrialScheduler Objects + +```python +class TrialScheduler() +``` + +Interface for implementing a Trial Scheduler class. + diff --git a/website/versioned_docs/version-1.0.4/reference/tune/searcher/blendsearch.md b/website/versioned_docs/version-1.0.4/reference/tune/searcher/blendsearch.md new file mode 100644 index 0000000000..40e71b61ef --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/tune/searcher/blendsearch.md @@ -0,0 +1,210 @@ +--- +sidebar_label: blendsearch +title: tune.searcher.blendsearch +--- + +## BlendSearch Objects + +```python +class BlendSearch(Searcher) +``` + +class for BlendSearch algorithm. 
+ +#### \_\_init\_\_ + +```python +def __init__(metric: Optional[str] = None, mode: Optional[str] = None, space: Optional[dict] = None, low_cost_partial_config: Optional[dict] = None, cat_hp_cost: Optional[dict] = None, points_to_evaluate: Optional[List[dict]] = None, evaluated_rewards: Optional[List] = None, time_budget_s: Union[int, float] = None, num_samples: Optional[int] = None, resource_attr: Optional[str] = None, min_resource: Optional[float] = None, max_resource: Optional[float] = None, reduction_factor: Optional[float] = None, global_search_alg: Optional[Searcher] = None, config_constraints: Optional[ + List[Tuple[Callable[[dict], float], str, float]] + ] = None, metric_constraints: Optional[List[Tuple[str, str, float]]] = None, seed: Optional[int] = 20, cost_attr: Optional[str] = "auto", experimental: Optional[bool] = False, lexico_objectives: Optional[dict] = None, use_incumbent_result_in_evaluation=False, allow_empty_config=False) +``` + +Constructor. + +**Arguments**: + +- `metric` - A string of the metric name to optimize for. +- `mode` - A string in ['min', 'max'] to specify the objective as + minimization or maximization. +- `space` - A dictionary to specify the search space. +- `low_cost_partial_config` - A dictionary from a subset of + controlled dimensions to the initial low-cost values. + E.g., ```{'n_estimators': 4, 'max_leaves': 4}```. +- `cat_hp_cost` - A dictionary from a subset of categorical dimensions + to the relative cost of each choice. + E.g., ```{'tree_method': [1, 1, 2]}```. + I.e., the relative cost of the three choices of 'tree_method' + is 1, 1 and 2 respectively. +- `points_to_evaluate` - Initial parameter suggestions to be run first. +- `evaluated_rewards` _list_ - If you have previously evaluated the + parameters passed in as points_to_evaluate you can avoid + re-running those trials by passing in the reward attributes + as a list so the optimiser can be told the results without + needing to re-compute the trial. 
Must be the same or shorter length than + points_to_evaluate. When provided, `mode` must be specified. +- `time_budget_s` - int or float | Time budget in seconds. +- `num_samples` - int | The number of configs to try. +- `resource_attr` - A string to specify the resource dimension and the best + performance is assumed to be at the max_resource. +- `min_resource` - A float of the minimal resource to use for the resource_attr. +- `max_resource` - A float of the maximal resource to use for the resource_attr. +- `reduction_factor` - A float of the reduction factor used for + incremental pruning. +- `global_search_alg` - A Searcher instance as the global search + instance. If omitted, Optuna is used. The following algos have + known issues when used as global_search_alg: + - HyperOptSearch raises exception sometimes + - TuneBOHB has its own scheduler +- `config_constraints` - A list of config constraints to be satisfied. + E.g., ```config_constraints = [(mem_size, '<=', 1024**3)]```. + `mem_size` is a function which produces a float number for the bytes + needed for a config. + It is used to skip configs which do not fit in memory. +- `metric_constraints` - A list of metric constraints to be satisfied. + E.g., `['precision', '>=', 0.9]`. The sign can be ">=" or "<=". +- `seed` - An integer of the random seed. +- `cost_attr` - Choose from ["auto", None] to specify the attribute to evaluate the cost of different trials. + Default is "auto", which means that we will automatically chose the cost attribute to use (depending + on the nature of the resource budget). When cost_attr is set to None, cost differences between different trials will be omitted + in our search algorithm. +- `lexico_objectives` - dict, default=None | It specifics information needed to perform multi-objective + optimization with lexicographic preferences. This is only supported in CFO currently. + When lexico_objectives is not None, the arguments metric, mode will be invalid. 
+ This dictionary shall contain the following fields of key-value pairs: + - "metrics": a list of optimization objectives with the orders reflecting the priorities/preferences of the + objectives. + - "modes" (optional): a list of optimization modes (each mode either "min" or "max") corresponding to the + objectives in the metric list. If not provided, we use "min" as the default mode for all the objectives. + - "targets" (optional): a dictionary to specify the optimization targets on the objectives. The keys are the + metric names (provided in "metric"), and the values are the numerical target values. + - "tolerances"(optional): a dictionary to specify the optimality tolerances on objectives. The keys are the + metric names (provided in "metrics"), and the values are the numerical tolerances values. + E.g., + ```python + lexico_objectives = { +- `"metrics"` - ["error_rate", "pred_time"], +- `"modes"` - ["min", "min"], +- `"tolerances"` - {"error_rate": 0.01, "pred_time": 0.0}, +- `"targets"` - {"error_rate": 0.0}, + } + ``` +- `experimental` - A bool of whether to use experimental features. + +#### save + +```python +def save(checkpoint_path: str) +``` + +save states to a checkpoint path. + +#### restore + +```python +def restore(checkpoint_path: str) +``` + +restore states from checkpoint. + +#### on\_trial\_complete + +```python +def on_trial_complete(trial_id: str, result: Optional[Dict] = None, error: bool = False) +``` + +search thread updater and cleaner. + +#### on\_trial\_result + +```python +def on_trial_result(trial_id: str, result: Dict) +``` + +receive intermediate result. + +#### suggest + +```python +def suggest(trial_id: str) -> Optional[Dict] +``` + +choose thread, suggest a valid config. + +#### results + +```python +@property +def results() -> List[Dict] +``` + +A list of dicts of results for each evaluated configuration. + +Each dict has "config" and metric names as keys. 
+The returned dict includes the initial results provided via `evaluated_reward`. + +## BlendSearchTuner Objects + +```python +class BlendSearchTuner(BlendSearch, NNITuner) +``` + +Tuner class for NNI. + +#### receive\_trial\_result + +```python +def receive_trial_result(parameter_id, parameters, value, **kwargs) +``` + +Receive trial's final result. + +**Arguments**: + +- `parameter_id` - int. +- `parameters` - object created by `generate_parameters()`. +- `value` - final metrics of the trial, including default metric. + +#### generate\_parameters + +```python +def generate_parameters(parameter_id, **kwargs) -> Dict +``` + +Returns a set of trial (hyper-)parameters, as a serializable object. + +**Arguments**: + +- `parameter_id` - int. + +#### update\_search\_space + +```python +def update_search_space(search_space) +``` + +Required by NNI. + +Tuners are advised to support updating search space at run-time. +If a tuner can only set search space once before generating first hyper-parameters, +it should explicitly document this behaviour. + +**Arguments**: + +- `search_space` - JSON object created by experiment owner. + +## CFO Objects + +```python +class CFO(BlendSearchTuner) +``` + +class for CFO algorithm. + +## RandomSearch Objects + +```python +class RandomSearch(CFO) +``` + +Class for random search. + diff --git a/website/versioned_docs/version-1.0.4/reference/tune/searcher/cfo_cat.md b/website/versioned_docs/version-1.0.4/reference/tune/searcher/cfo_cat.md new file mode 100644 index 0000000000..1ac3ce5f58 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/tune/searcher/cfo_cat.md @@ -0,0 +1,21 @@ +--- +sidebar_label: cfo_cat +title: tune.searcher.cfo_cat +--- + +## FLOW2Cat Objects + +```python +class FLOW2Cat(FLOW2) +``` + +Local search algorithm optimized for categorical variables. + +## CFOCat Objects + +```python +class CFOCat(CFO) +``` + +CFO optimized for categorical variables. 
+ diff --git a/website/versioned_docs/version-1.0.4/reference/tune/searcher/flow2.md b/website/versioned_docs/version-1.0.4/reference/tune/searcher/flow2.md new file mode 100644 index 0000000000..eb100f497d --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/tune/searcher/flow2.md @@ -0,0 +1,150 @@ +--- +sidebar_label: flow2 +title: tune.searcher.flow2 +--- + +## FLOW2 Objects + +```python +class FLOW2(Searcher) +``` + +Local search algorithm FLOW2, with adaptive step size. + +#### \_\_init\_\_ + +```python +def __init__(init_config: dict, metric: Optional[str] = None, mode: Optional[str] = None, space: Optional[dict] = None, resource_attr: Optional[str] = None, min_resource: Optional[float] = None, max_resource: Optional[float] = None, resource_multiple_factor: Optional[float] = None, cost_attr: Optional[str] = "time_total_s", seed: Optional[int] = 20, lexico_objectives=None) +``` + +Constructor. + +**Arguments**: + +- `init_config` - a dictionary of a partial or full initial config, + e.g., from a subset of controlled dimensions + to the initial low-cost values. + E.g., {'epochs': 1}. +- `metric` - A string of the metric name to optimize for. +- `mode` - A string in ['min', 'max'] to specify the objective as + minimization or maximization. +- `space` - A dictionary to specify the search space. +- `resource_attr` - A string to specify the resource dimension and the best + performance is assumed to be at the max_resource. +- `min_resource` - A float of the minimal resource to use for the resource_attr. +- `max_resource` - A float of the maximal resource to use for the resource_attr. +- `resource_multiple_factor` - A float of the multiplicative factor + used for increasing resource. +- `cost_attr` - A string of the attribute used for cost. +- `seed` - An integer of the random seed. +- `lexico_objectives` - dict, default=None | It specifics information needed to perform multi-objective + optimization with lexicographic preferences. 
When lexico_objectives is not None, the arguments metric, + mode will be invalid. This dictionary shall contain the following fields of key-value pairs: + - "metrics": a list of optimization objectives with the orders reflecting the priorities/preferences of the + objectives. + - "modes" (optional): a list of optimization modes (each mode either "min" or "max") corresponding to the + objectives in the metric list. If not provided, we use "min" as the default mode for all the objectives + - "targets" (optional): a dictionary to specify the optimization targets on the objectives. The keys are the + metric names (provided in "metric"), and the values are the numerical target values. + - "tolerances"(optional): a dictionary to specify the optimality tolerances on objectives. The keys are the + metric names (provided in "metrics"), and the values are the numerical tolerances values. + E.g., + ```python + lexico_objectives = { +- `"metrics"` - ["error_rate", "pred_time"], +- `"modes"` - ["min", "min"], +- `"tolerances"` - {"error_rate": 0.01, "pred_time": 0.0}, +- `"targets"` - {"error_rate": 0.0}, + } + ``` + +#### complete\_config + +```python +def complete_config(partial_config: Dict, lower: Optional[Dict] = None, upper: Optional[Dict] = None) -> Tuple[Dict, Dict] +``` + +Generate a complete config from the partial config input. + +Add minimal resource to config if available. + +#### normalize + +```python +def normalize(config, recursive=False) -> Dict +``` + +normalize each dimension in config to [0,1]. + +#### denormalize + +```python +def denormalize(config) +``` + +denormalize each dimension in config from [0,1]. + +#### on\_trial\_complete + +```python +def on_trial_complete(trial_id: str, result: Optional[Dict] = None, error: bool = False) +``` + +Compare with incumbent. +If better, move, reset num_complete and num_proposed. +If not better and num_complete >= 2*dim, num_allowed += 2. 
+
+#### on\_trial\_result
+
+```python
+def on_trial_result(trial_id: str, result: Dict)
+```
+
+Early update of incumbent.
+
+#### suggest
+
+```python
+def suggest(trial_id: str) -> Optional[Dict]
+```
+
+Suggest a new config, one of the following cases:
+1. same incumbent, increase resource.
+2. same resource, move from the incumbent to a random direction.
+3. same resource, move from the incumbent to the opposite direction.
+
+#### can\_suggest
+
+```python
+@property
+def can_suggest() -> bool
+```
+
+Can't suggest if 2*dim configs have been proposed for the incumbent
+while fewer are completed.
+
+#### config\_signature
+
+```python
+def config_signature(config, space: Dict = None) -> tuple
+```
+
+Return the signature tuple of a config.
+
+#### converged
+
+```python
+@property
+def converged() -> bool
+```
+
+Whether the local search has converged.
+
+#### reach
+
+```python
+def reach(other: Searcher) -> bool
+```
+
+whether the incumbent can reach the incumbent of other.
+
diff --git a/website/versioned_docs/version-1.0.4/reference/tune/searcher/online_searcher.md b/website/versioned_docs/version-1.0.4/reference/tune/searcher/online_searcher.md
new file mode 100644
index 0000000000..60bfd24d18
--- /dev/null
+++ b/website/versioned_docs/version-1.0.4/reference/tune/searcher/online_searcher.md
@@ -0,0 +1,62 @@
+---
+sidebar_label: online_searcher
+title: tune.searcher.online_searcher
+---
+
+## BaseSearcher Objects
+
+```python
+class BaseSearcher()
+```
+
+Abstract class for an online searcher.
+
+## ChampionFrontierSearcher Objects
+
+```python
+class ChampionFrontierSearcher(BaseSearcher)
+```
+
+The ChampionFrontierSearcher class.
+
+NOTE about the correspondence between this code and the research paper:
+[ChaCha for Online AutoML](https://arxiv.org/pdf/2106.04815.pdf).
+This class serves the role of ConfigOracle as described in the paper.
+ +#### \_\_init\_\_ + +```python +def __init__(init_config: Dict, space: Optional[Dict] = None, metric: Optional[str] = None, mode: Optional[str] = None, random_seed: Optional[int] = 2345, online_trial_args: Optional[Dict] = {}, nonpoly_searcher_name: Optional[str] = "CFO") +``` + +Constructor. + +**Arguments**: + +- `init_config` - A dictionary of initial configuration. +- `space` - A dictionary to specify the search space. +- `metric` - A string of the metric name to optimize for. +- `mode` - A string in ['min', 'max'] to specify the objective as + minimization or maximization. +- `random_seed` - An integer of the random seed. +- `online_trial_args` - A dictionary to specify the online trial + arguments for experimental purpose. +- `nonpoly_searcher_name` - A string to specify the search algorithm + for nonpoly hyperparameters. + +#### set\_search\_properties + +```python +def set_search_properties(metric: Optional[str] = None, mode: Optional[str] = None, config: Optional[Dict] = {}, setting: Optional[Dict] = {}, init_call: Optional[bool] = False) +``` + +Construct search space with the given config, and setup the search. + +#### next\_trial + +```python +def next_trial() +``` + +Return a trial from the _challenger_list. + diff --git a/website/versioned_docs/version-1.0.4/reference/tune/searcher/search_thread.md b/website/versioned_docs/version-1.0.4/reference/tune/searcher/search_thread.md new file mode 100644 index 0000000000..3471fa55da --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/tune/searcher/search_thread.md @@ -0,0 +1,54 @@ +--- +sidebar_label: search_thread +title: tune.searcher.search_thread +--- + +## SearchThread Objects + +```python +class SearchThread() +``` + +Class of global or local search thread. 
+ +#### \_\_init\_\_ + +```python +def __init__(mode: str = "min", search_alg: Optional[Searcher] = None, cost_attr: Optional[str] = "time_total_s", eps: Optional[float] = 1.0) +``` + +When search_alg is omitted, use local search FLOW2. + +#### suggest + +```python +def suggest(trial_id: str) -> Optional[Dict] +``` + +Use the suggest() of the underlying search algorithm. + +#### on\_trial\_complete + +```python +def on_trial_complete(trial_id: str, result: Optional[Dict] = None, error: bool = False) +``` + +Update the statistics of the thread. + +#### reach + +```python +def reach(thread) -> bool +``` + +Whether the incumbent can reach the incumbent of thread. + +#### can\_suggest + +```python +@property +def can_suggest() -> bool +``` + +Whether the thread can suggest new configs. + diff --git a/website/versioned_docs/version-1.0.4/reference/tune/searcher/suggestion.md b/website/versioned_docs/version-1.0.4/reference/tune/searcher/suggestion.md new file mode 100644 index 0000000000..c9dce3b318 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/tune/searcher/suggestion.md @@ -0,0 +1,228 @@ +--- +sidebar_label: suggestion +title: tune.searcher.suggestion +--- + +## Searcher Objects + +```python +class Searcher() +``` + +Abstract class for wrapping suggesting algorithms. +Custom algorithms can extend this class easily by overriding the +`suggest` method provide generated parameters for the trials. +Any subclass that implements ``__init__`` must also call the +constructor of this class: ``super(Subclass, self).__init__(...)``. +To track suggestions and their corresponding evaluations, the method +`suggest` will be passed a trial_id, which will be used in +subsequent notifications. +Not all implementations support multi objectives. + +**Arguments**: + +- `metric` _str or list_ - The training result objective value attribute. If + list then list of training result objective value attributes +- `mode` _str or list_ - If string One of {min, max}. 
If list then + list of max and min, determines whether objective is minimizing + or maximizing the metric attribute. Must match type of metric. + +```python +class ExampleSearch(Searcher): + def __init__(self, metric="mean_loss", mode="min", **kwargs): + super(ExampleSearch, self).__init__( + metric=metric, mode=mode, **kwargs) + self.optimizer = Optimizer() + self.configurations = {} + def suggest(self, trial_id): + configuration = self.optimizer.query() + self.configurations[trial_id] = configuration + def on_trial_complete(self, trial_id, result, **kwargs): + configuration = self.configurations[trial_id] + if result and self.metric in result: + self.optimizer.update(configuration, result[self.metric]) +tune.run(trainable_function, search_alg=ExampleSearch()) +``` + +#### set\_search\_properties + +```python +def set_search_properties(metric: Optional[str], mode: Optional[str], config: Dict) -> bool +``` + +Pass search properties to searcher. +This method acts as an alternative to instantiating search algorithms +with their own specific search spaces. Instead they can accept a +Tune config through this method. A searcher should return ``True`` +if setting the config was successful, or ``False`` if it was +unsuccessful, e.g. when the search space has already been set. + +**Arguments**: + +- `metric` _str_ - Metric to optimize +- `mode` _str_ - One of ["min", "max"]. Direction to optimize. +- `config` _dict_ - Tune config dict. + +#### on\_trial\_result + +```python +def on_trial_result(trial_id: str, result: Dict) +``` + +Optional notification for result during training. +Note that by default, the result dict may include NaNs or +may not include the optimization metric. It is up to the +subclass implementation to preprocess the result to +avoid breaking the optimization process. + +**Arguments**: + +- `trial_id` _str_ - A unique string ID for the trial. +- `result` _dict_ - Dictionary of metrics for current training progress. 
+ Note that the result dict may include NaNs or + may not include the optimization metric. It is up to the + subclass implementation to preprocess the result to + avoid breaking the optimization process. + +#### metric + +```python +@property +def metric() -> str +``` + +The training result objective value attribute. + +#### mode + +```python +@property +def mode() -> str +``` + +Specifies if minimizing or maximizing the metric. + +## ConcurrencyLimiter Objects + +```python +class ConcurrencyLimiter(Searcher) +``` + +A wrapper algorithm for limiting the number of concurrent trials. + +**Arguments**: + +- `searcher` _Searcher_ - Searcher object that the + ConcurrencyLimiter will manage. +- `max_concurrent` _int_ - Maximum concurrent samples from the underlying + searcher. +- `batch` _bool_ - Whether to wait for all concurrent samples + to finish before updating the underlying searcher. + +**Example**: + +```python +from ray.tune.suggest import ConcurrencyLimiter # ray version < 2 +search_alg = HyperOptSearch(metric="accuracy") +search_alg = ConcurrencyLimiter(search_alg, max_concurrent=2) +tune.run(trainable, search_alg=search_alg) +``` + +#### validate\_warmstart + +```python +def validate_warmstart(parameter_names: List[str], points_to_evaluate: List[Union[List, Dict]], evaluated_rewards: List, validate_point_name_lengths: bool = True) +``` + +Generic validation of a Searcher's warm start functionality. +Raises exceptions in case of type and length mismatches between +parameters. +If ``validate_point_name_lengths`` is False, the equality of lengths +between ``points_to_evaluate`` and ``parameter_names`` will not be +validated. + +## OptunaSearch Objects + +```python +class OptunaSearch(Searcher) +``` + +A wrapper around Optuna to provide trial suggestions. +[Optuna](https://optuna.org/) +is a hyperparameter optimization library. +In contrast to other libraries, it employs define-by-run style +hyperparameter definitions. 
+This Searcher is a thin wrapper around Optuna's search algorithms. +You can pass any Optuna sampler, which will be used to generate +hyperparameter suggestions. + +**Arguments**: + +- `space` _dict|Callable_ - Hyperparameter search space definition for + Optuna's sampler. This can be either a class `dict` with + parameter names as keys and ``optuna.distributions`` as values, + or a Callable - in which case, it should be a define-by-run + function using ``optuna.trial`` to obtain the hyperparameter + values. The function should return either a class `dict` of + constant values with names as keys, or None. + For more information, see + [tutorial](https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/002_configurations.html). + Warning - No actual computation should take place in the define-by-run + function. Instead, put the training logic inside the function + or class trainable passed to tune.run. +- `metric` _str_ - The training result objective value attribute. If None + but a mode was passed, the anonymous metric `_metric` will be used + per default. +- `mode` _str_ - One of {min, max}. Determines whether objective is + minimizing or maximizing the metric attribute. +- `points_to_evaluate` _list_ - Initial parameter suggestions to be run + first. This is for when you already have some good parameters + you want to run first to help the algorithm make better suggestions + for future parameters. Needs to be a list of dicts containing the + configurations. +- `sampler` _optuna.samplers.BaseSampler_ - Optuna sampler used to + draw hyperparameter configurations. Defaults to ``TPESampler``. +- `seed` _int_ - Seed to initialize sampler with. This parameter is only + used when ``sampler=None``. In all other cases, the sampler + you pass should be initialized with the seed already. 
+- `evaluated_rewards` _list_ - If you have previously evaluated the + parameters passed in as points_to_evaluate you can avoid + re-running those trials by passing in the reward attributes + as a list so the optimiser can be told the results without + needing to re-compute the trial. Must be the same length as + points_to_evaluate. + + Tune automatically converts search spaces to Optuna's format: + +````python +from ray.tune.suggest.optuna import OptunaSearch # ray version < 2 +config = { "a": tune.uniform(6, 8), + "b": tune.loguniform(1e-4, 1e-2)} +optuna_search = OptunaSearch(metric="loss", mode="min") +tune.run(trainable, config=config, search_alg=optuna_search) +```` + + If you would like to pass the search space manually, the code would + look like this: + +```python +from ray.tune.suggest.optuna import OptunaSearch # ray version < 2 +import optuna +config = { "a": optuna.distributions.UniformDistribution(6, 8), + "b": optuna.distributions.LogUniformDistribution(1e-4, 1e-2)} +optuna_search = OptunaSearch(space,metric="loss",mode="min") +tune.run(trainable, search_alg=optuna_search) +# Equivalent Optuna define-by-run function approach: +def define_search_space(trial: optuna.Trial): + trial.suggest_float("a", 6, 8) + trial.suggest_float("b", 1e-4, 1e-2, log=True) + # training logic goes into trainable, this is just + # for search space definition +optuna_search = OptunaSearch( + define_search_space, + metric="loss", + mode="min") +tune.run(trainable, search_alg=optuna_search) +.. 
versionadded:: 0.8.8 +``` + diff --git a/website/versioned_docs/version-1.0.4/reference/tune/searcher/variant_generator.md b/website/versioned_docs/version-1.0.4/reference/tune/searcher/variant_generator.md new file mode 100644 index 0000000000..e44064de85 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/tune/searcher/variant_generator.md @@ -0,0 +1,54 @@ +--- +sidebar_label: variant_generator +title: tune.searcher.variant_generator +--- + +## TuneError Objects + +```python +class TuneError(Exception) +``` + +General error class raised by ray.tune. + +#### generate\_variants + +```python +def generate_variants(unresolved_spec: Dict, constant_grid_search: bool = False, random_state: "RandomState" = None) -> Generator[Tuple[Dict, Dict], None, None] +``` + +Generates variants from a spec (dict) with unresolved values. +There are two types of unresolved values: +Grid search: These define a grid search over values. For example, the +following grid search values in a spec will produce six distinct +variants in combination: +"activation": grid_search(["relu", "tanh"]) +"learning_rate": grid_search([1e-3, 1e-4, 1e-5]) +Lambda functions: These are evaluated to produce a concrete value, and +can express dependencies or conditional distributions between values. +They can also be used to express random search (e.g., by calling +into the `random` or `np` module). +"cpu": lambda spec: spec.config.num_workers +"batch_size": lambda spec: random.uniform(1, 1000) +Finally, to support defining specs in plain JSON / YAML, grid search +and lambda functions can also be defined alternatively as follows: +"activation": {"grid_search": ["relu", "tanh"]} +"cpu": {"eval": "spec.config.num_workers"} +Use `format_vars` to format the returned dict of hyperparameters. + +**Yields**: + + (Dict of resolved variables, Spec object) + +#### grid\_search + +```python +def grid_search(values: List) -> Dict[str, List] +``` + +Convenience method for specifying grid search over a value. 
+ +**Arguments**: + +- `values` - An iterable whose parameters will be gridded. + diff --git a/website/versioned_docs/version-1.0.4/reference/tune/space.md b/website/versioned_docs/version-1.0.4/reference/tune/space.md new file mode 100644 index 0000000000..ec354d52ca --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/tune/space.md @@ -0,0 +1,67 @@ +--- +sidebar_label: space +title: tune.space +--- + +#### define\_by\_run\_func + +```python +def define_by_run_func(trial, space: Dict, path: str = "") -> Optional[Dict[str, Any]] +``` + +Define-by-run function to create the search space. + +**Returns**: + + A dict with constant values. + +#### unflatten\_hierarchical + +```python +def unflatten_hierarchical(config: Dict, space: Dict) -> Tuple[Dict, Dict] +``` + +Unflatten hierarchical config. + +#### add\_cost\_to\_space + +```python +def add_cost_to_space(space: Dict, low_cost_point: Dict, choice_cost: Dict) +``` + +Update the space in place by adding low_cost_point and choice_cost. + +**Returns**: + + A dict with constant values. + +#### normalize + +```python +def normalize(config: Dict, space: Dict, reference_config: Dict, normalized_reference_config: Dict, recursive: bool = False) +``` + +Normalize config in space according to reference_config. + +Normalize each dimension in config to [0,1]. + +#### indexof + +```python +def indexof(domain: Dict, config: Dict) -> int +``` + +Find the index of config in domain.categories. + +#### complete\_config + +```python +def complete_config(partial_config: Dict, space: Dict, flow2, disturb: bool = False, lower: Optional[Dict] = None, upper: Optional[Dict] = None) -> Tuple[Dict, Dict] +``` + +Complete partial config in space. + +**Returns**: + + config, space. 
+ diff --git a/website/versioned_docs/version-1.0.4/reference/tune/trial.md b/website/versioned_docs/version-1.0.4/reference/tune/trial.md new file mode 100644 index 0000000000..7af0ac3246 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/tune/trial.md @@ -0,0 +1,46 @@ +--- +sidebar_label: trial +title: tune.trial +--- + +#### unflatten\_dict + +```python +def unflatten_dict(dt, delimiter="/") +``` + +Unflatten dict. Does not support unflattening lists. + +## Trial Objects + +```python +class Trial() +``` + +A trial object holds the state for one model training run. +Trials are themselves managed by the TrialRunner class, which implements +the event loop for submitting trial runs to a Ray cluster. +Trials start in the PENDING state, and transition to RUNNING once started. +On error it transitions to ERROR, otherwise TERMINATED on success. + +**Attributes**: + +- `trainable_name` _str_ - Name of the trainable object to be executed. +- `config` _dict_ - Provided configuration dictionary with evaluated params. +- `trial_id` _str_ - Unique identifier for the trial. +- `local_dir` _str_ - Local_dir as passed to tune.run. +- `logdir` _str_ - Directory where the trial logs are saved. +- `evaluated_params` _dict_ - Evaluated parameters by search algorithm, +- `experiment_tag` _str_ - Identifying trial name to show in the console. +- `resources` _Resources_ - Amount of resources that this trial will use. +- `status` _str_ - One of PENDING, RUNNING, PAUSED, TERMINATED, ERROR/ +- `error_file` _str_ - Path to the errors that this trial has raised. + +#### set\_status + +```python +def set_status(status) +``` + +Sets the status of the trial. 
+ diff --git a/website/versioned_docs/version-1.0.4/reference/tune/trial_runner.md b/website/versioned_docs/version-1.0.4/reference/tune/trial_runner.md new file mode 100644 index 0000000000..6408a0140f --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/tune/trial_runner.md @@ -0,0 +1,86 @@ +--- +sidebar_label: trial_runner +title: tune.trial_runner +--- + +## Nologger Objects + +```python +class Nologger() +``` + +Logger without logging. + +## SimpleTrial Objects + +```python +class SimpleTrial(Trial) +``` + +A simple trial class. + +## BaseTrialRunner Objects + +```python +class BaseTrialRunner() +``` + +Implementation of a simple trial runner. + +Note that the caller usually should not mutate trial state directly. + +#### get\_trials + +```python +def get_trials() +``` + +Returns the list of trials managed by this TrialRunner. + +Note that the caller usually should not mutate trial state directly. + +#### add\_trial + +```python +def add_trial(trial) +``` + +Adds a new trial to this TrialRunner. + +Trials may be added at any time. + +**Arguments**: + +- `trial` _Trial_ - Trial to queue. + +#### stop\_trial + +```python +def stop_trial(trial) +``` + +Stops trial. + +## SequentialTrialRunner Objects + +```python +class SequentialTrialRunner(BaseTrialRunner) +``` + +Implementation of the sequential trial runner. + +#### step + +```python +def step() -> Trial +``` + +Runs one step of the trial event loop. + +Callers should typically run this method repeatedly in a loop. They +may inspect or modify the runner's state in between calls to step(). + +**Returns**: + + a trial to run. 
+ diff --git a/website/versioned_docs/version-1.0.4/reference/tune/tune.md b/website/versioned_docs/version-1.0.4/reference/tune/tune.md new file mode 100644 index 0000000000..9c3e015763 --- /dev/null +++ b/website/versioned_docs/version-1.0.4/reference/tune/tune.md @@ -0,0 +1,252 @@ +--- +sidebar_label: tune +title: tune.tune +--- + +## ExperimentAnalysis Objects + +```python +class ExperimentAnalysis(EA) +``` + +Class for storing the experiment results. + +#### report + +```python +def report(_metric=None, **kwargs) +``` + +A function called by the HPO application to report final or intermediate +results. + +**Example**: + + +```python +import time +from flaml import tune + +def compute_with_config(config): + current_time = time.time() + metric2minimize = (round(config['x'])-95000)**2 + time2eval = time.time() - current_time + tune.report(metric2minimize=metric2minimize, time2eval=time2eval) + +analysis = tune.run( + compute_with_config, + config={ + 'x': tune.lograndint(lower=1, upper=1000000), + 'y': tune.randint(lower=1, upper=1000000) + }, + metric='metric2minimize', mode='min', + num_samples=1000000, time_budget_s=60, use_ray=False) + +print(analysis.trials[-1].last_result) +``` + + +**Arguments**: + +- `_metric` - Optional default anonymous metric for ``tune.report(value)``. + (For compatibility with ray.tune.report) +- `**kwargs` - Any key value pair to be reported. + + +**Raises**: + + StopIteration (when not using ray, i.e., _use_ray=False): + A StopIteration exception is raised if the trial has been signaled to stop. + SystemExit (when using ray): + A SystemExit exception is raised if the trial has been signaled to stop by ray. 
+ +#### run + +```python +def run(evaluation_function, config: Optional[dict] = None, low_cost_partial_config: Optional[dict] = None, cat_hp_cost: Optional[dict] = None, metric: Optional[str] = None, mode: Optional[str] = None, time_budget_s: Union[int, float] = None, points_to_evaluate: Optional[List[dict]] = None, evaluated_rewards: Optional[List] = None, resource_attr: Optional[str] = None, min_resource: Optional[float] = None, max_resource: Optional[float] = None, reduction_factor: Optional[float] = None, scheduler=None, search_alg=None, verbose: Optional[int] = 2, local_dir: Optional[str] = None, num_samples: Optional[int] = 1, resources_per_trial: Optional[dict] = None, config_constraints: Optional[ + List[Tuple[Callable[[dict], float], str, float]] + ] = None, metric_constraints: Optional[List[Tuple[str, str, float]]] = None, max_failure: Optional[int] = 100, use_ray: Optional[bool] = False, use_incumbent_result_in_evaluation: Optional[bool] = None, log_file_name: Optional[str] = None, lexico_objectives: Optional[dict] = None, **ray_args, ,) +``` + +The trigger for HPO. 
+
+**Example**:
+
+
+```python
+import time
+from flaml import tune
+
+def compute_with_config(config):
+    current_time = time.time()
+    metric2minimize = (round(config['x'])-95000)**2
+    time2eval = time.time() - current_time
+    tune.report(metric2minimize=metric2minimize, time2eval=time2eval)
+    # if the evaluation fails unexpectedly and the exception is caught,
+    # and it doesn't inform the goodness of the config,
+    # return {}
+    # if the failure indicates a config is bad,
+    # report a bad metric value like np.inf or -np.inf
+    # depending on metric mode being min or max
+
+analysis = tune.run(
+    compute_with_config,
+    config={
+        'x': tune.lograndint(lower=1, upper=1000000),
+        'y': tune.randint(lower=1, upper=1000000)
+    },
+    metric='metric2minimize', mode='min',
+    num_samples=-1, time_budget_s=60, use_ray=False)
+
+print(analysis.trials[-1].last_result)
+```
+
+
+**Arguments**:
+
+- `evaluation_function` - A user-defined evaluation function.
+    It takes a configuration as input, outputs an evaluation
+    result (can be a numerical value or a dictionary of string
+    and numerical value pairs) for the input configuration.
+    For machine learning tasks, it usually involves training and
+    scoring a machine learning model, e.g., through validation loss.
+- `config` - A dictionary to specify the search space.
+- `low_cost_partial_config` - A dictionary from a subset of
+    controlled dimensions to the initial low-cost values.
+    e.g., ```{'n_estimators': 4, 'max_leaves': 4}```
+
+- `cat_hp_cost` - A dictionary from a subset of categorical dimensions
+    to the relative cost of each choice.
+    e.g., ```{'tree_method': [1, 1, 2]}```
+    i.e., the relative cost of the
+    three choices of 'tree_method' is 1, 1 and 2 respectively
+- `metric` - A string of the metric name to optimize for.
+- `mode` - A string in ['min', 'max'] to specify the objective as
+    minimization or maximization.
+- `time_budget_s` - int or float | The time budget in seconds.
+- `points_to_evaluate` - A list of initial hyperparameter + configurations to run first. +- `evaluated_rewards` _list_ - If you have previously evaluated the + parameters passed in as points_to_evaluate you can avoid + re-running those trials by passing in the reward attributes + as a list so the optimiser can be told the results without + needing to re-compute the trial. Must be the same or shorter length than + points_to_evaluate. + e.g., + +```python +points_to_evaluate = [ + {"b": .99, "cost_related": {"a": 3}}, + {"b": .99, "cost_related": {"a": 2}}, +] +evaluated_rewards = [3.0] +``` + + means that you know the reward for the first config in + points_to_evaluate is 3.0 and want to inform run(). + +- `resource_attr` - A string to specify the resource dimension used by + the scheduler via "scheduler". +- `min_resource` - A float of the minimal resource to use for the resource_attr. +- `max_resource` - A float of the maximal resource to use for the resource_attr. +- `reduction_factor` - A float of the reduction factor used for incremental + pruning. +- `scheduler` - A scheduler for executing the experiment. Can be None, 'flaml', + 'asha' (or 'async_hyperband', 'asynchyperband') or a custom instance of the TrialScheduler class. Default is None: + in this case when resource_attr is provided, the 'flaml' scheduler will be + used, otherwise no scheduler will be used. When set 'flaml', an + authentic scheduler implemented in FLAML will be used. It does not + require users to report intermediate results in evaluation_function. + Find more details about this scheduler in this paper + https://arxiv.org/pdf/1911.04706.pdf). + When set 'asha', the input for arguments "resource_attr", + "min_resource", "max_resource" and "reduction_factor" will be passed + to ASHA's "time_attr", "max_t", "grace_period" and "reduction_factor" + respectively. You can also provide a self-defined scheduler instance + of the TrialScheduler class. 
When 'asha' or self-defined scheduler is
+    used, you usually need to report intermediate results in the evaluation
+    function via 'tune.report()'.
+    If you would like to do some cleanup operation when the trial is stopped
+    by the scheduler, you can catch the `StopIteration` (when not using ray)
+    or `SystemExit` (when using ray) exception explicitly,
+    as shown in the following example.
+    Please find more examples using different types of schedulers
+    and how to set up the corresponding evaluation functions in
+    test/tune/test_scheduler.py, and test/tune/example_scheduler.py.
+```python
+def easy_objective(config):
+    width, height = config["width"], config["height"]
+    for step in range(config["steps"]):
+        intermediate_score = evaluation_fn(step, width, height)
+        try:
+            tune.report(iterations=step, mean_loss=intermediate_score)
+        except (StopIteration, SystemExit):
+            # do cleanup operation here
+            return
+```
+- `search_alg` - An instance of BlendSearch as the search algorithm
+    to be used. The same instance can be used for iterative tuning.
+    e.g.,
+
+```python
+from flaml import BlendSearch
+algo = BlendSearch(metric='val_loss', mode='min',
+                   space=search_space,
+                   low_cost_partial_config=low_cost_partial_config)
+for i in range(10):
+    analysis = tune.run(compute_with_config,
+                        search_alg=algo, use_ray=False)
+    print(analysis.trials[-1].last_result)
+```
+
+- `verbose` - 0, 1, 2, or 3. Verbosity mode for ray if ray backend is used.
+    0 = silent, 1 = only status updates, 2 = status and brief trial
+    results, 3 = status and detailed trial results. Defaults to 2.
+- `local_dir` - A string of the local dir to save ray logs if ray backend is
+    used; or a local dir to save the tuning log.
+- `num_samples` - An integer of the number of configs to try. Defaults to 1.
+- `resources_per_trial` - A dictionary of the hardware resources to allocate
+    per trial, e.g., `{'cpu': 1}`. It is only valid when using ray backend
+    (by setting 'use_ray = True').
It shall be used when you need to do
+    [parallel tuning](../../Use-Cases/Tune-User-Defined-Function#parallel-tuning).
+- `config_constraints` - A list of config constraints to be satisfied.
+    e.g., ```config_constraints = [(mem_size, '<=', 1024**3)]```
+
+    mem_size is a function which produces a float number for the bytes
+    needed for a config.
+    It is used to skip configs which do not fit in memory.
+- `metric_constraints` - A list of metric constraints to be satisfied.
+    e.g., `[('precision', '>=', 0.9)]`. The sign can be ">=" or "<=".
+- `max_failure` - int | the maximal consecutive number of failures to sample
+    a trial before the tuning is terminated.
+- `use_ray` - A boolean of whether to use ray as the backend.
+- `log_file_name` - A string of the log file name. Default to None.
+    When set to None:
+        if local_dir is not given, no log file is created;
+        if local_dir is given, the log file name will be autogenerated under local_dir.
+    Only valid when verbose > 0 or use_ray is True.
+- `lexico_objectives` - dict, default=None | It specifies information needed to perform multi-objective
+    optimization with lexicographic preferences. When lexico_objectives is not None, the arguments metric and
+    mode will be invalid, and flaml's tune uses CFO
+    as the `search_alg`, which makes the input (if provided) `search_alg` invalid.
+    This dictionary shall contain the following fields of key-value pairs:
+    - "metrics": a list of optimization objectives with the orders reflecting the priorities/preferences of the
+    objectives.
+    - "modes" (optional): a list of optimization modes (each mode either "min" or "max") corresponding to the
+    objectives in the metric list. If not provided, we use "min" as the default mode for all the objectives.
+    - "targets" (optional): a dictionary to specify the optimization targets on the objectives. The keys are the
+    metric names (provided in "metrics"), and the values are the numerical target values.
+    - "tolerances" (optional): a dictionary to specify the optimality tolerances on objectives. The keys are the
+    metric names (provided in "metrics"), and the values are the numerical tolerance values.
+    E.g.,
+    ```python
+    lexico_objectives = {
+        "metrics": ["error_rate", "pred_time"],
+        "modes": ["min", "min"],
+        "tolerances": {"error_rate": 0.01, "pred_time": 0.0},
+        "targets": {"error_rate": 0.0},
+    }
+    ```
+- `**ray_args` - keyword arguments to pass to ray.tune.run().
+    Only valid when use_ray=True.
+
diff --git a/website/versioned_docs/version-1.0.4/reference/tune/utils.md b/website/versioned_docs/version-1.0.4/reference/tune/utils.md
new file mode 100644
index 0000000000..55a68b335b
--- /dev/null
+++ b/website/versioned_docs/version-1.0.4/reference/tune/utils.md
@@ -0,0 +1,21 @@
+---
+sidebar_label: utils
+title: tune.utils
+---
+
+#### choice
+
+```python
+def choice(categories: Sequence, order=None)
+```
+
+Sample a categorical value.
+Sampling from ``tune.choice([1, 2])`` is equivalent to sampling from
+``np.random.choice([1, 2])``
+
+**Arguments**:
+
+- `categories` _Sequence_ - Sequence of categories to sample from.
+- `order` _bool_ - Whether the categories have an order. If None, will be decided automatically:
+    Numerical categories have an order, while string categories do not.
+ diff --git a/website/versioned_sidebars/version-1.0.4-sidebars.json b/website/versioned_sidebars/version-1.0.4-sidebars.json new file mode 100644 index 0000000000..3534d9fc03 --- /dev/null +++ b/website/versioned_sidebars/version-1.0.4-sidebars.json @@ -0,0 +1,242 @@ +{ + "version-1.0.4/docsSidebar": [ + { + "type": "doc", + "id": "version-1.0.4/Getting-Started" + }, + { + "type": "doc", + "id": "version-1.0.4/Installation" + }, + { + "type": "category", + "collapsed": true, + "collapsible": true, + "label": "Use Cases", + "items": [ + { + "type": "autogenerated", + "dirName": "Use-Cases" + } + ] + }, + { + "type": "category", + "collapsed": true, + "collapsible": true, + "label": "Examples", + "items": [ + { + "type": "autogenerated", + "dirName": "Examples" + } + ] + }, + { + "type": "doc", + "id": "version-1.0.4/Contribute" + }, + { + "type": "doc", + "id": "version-1.0.4/Research" + } + ], + "version-1.0.4/referenceSideBar": [ + { + "items": [ + { + "items": [ + { + "items": [ + { + "items": [ + { + "type": "doc", + "id": "version-1.0.4/reference/automl/nlp/huggingface/trainer" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/automl/nlp/huggingface/training_args" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/automl/nlp/huggingface/utils" + } + ], + "label": "automl.nlp.huggingface", + "type": "category", + "collapsible": true, + "collapsed": true + }, + { + "type": "doc", + "id": "version-1.0.4/reference/automl/nlp/utils" + } + ], + "label": "automl.nlp", + "type": "category", + "collapsible": true, + "collapsed": true + }, + { + "type": "doc", + "id": "version-1.0.4/reference/automl/automl" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/automl/data" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/automl/ml" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/automl/model" + } + ], + "label": "automl", + "type": "category", + "collapsible": true, + "collapsed": true + }, + { + "items": [ + { + "type": 
"doc", + "id": "version-1.0.4/reference/default/estimator" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/default/greedy" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/default/portfolio" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/default/suggest" + } + ], + "label": "default", + "type": "category", + "collapsible": true, + "collapsed": true + }, + { + "items": [ + { + "type": "doc", + "id": "version-1.0.4/reference/onlineml/autovw" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/onlineml/trial" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/onlineml/trial_runner" + } + ], + "label": "onlineml", + "type": "category", + "collapsible": true, + "collapsed": true + }, + { + "items": [ + { + "items": [ + { + "type": "doc", + "id": "version-1.0.4/reference/tune/scheduler/online_scheduler" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/tune/scheduler/trial_scheduler" + } + ], + "label": "tune.scheduler", + "type": "category", + "collapsible": true, + "collapsed": true + }, + { + "items": [ + { + "type": "doc", + "id": "version-1.0.4/reference/tune/searcher/blendsearch" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/tune/searcher/cfo_cat" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/tune/searcher/flow2" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/tune/searcher/online_searcher" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/tune/searcher/search_thread" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/tune/searcher/suggestion" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/tune/searcher/variant_generator" + } + ], + "label": "tune.searcher", + "type": "category", + "collapsible": true, + "collapsed": true + }, + { + "type": "doc", + "id": "version-1.0.4/reference/tune/analysis" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/tune/sample" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/tune/space" + }, + { 
+ "type": "doc", + "id": "version-1.0.4/reference/tune/trial" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/tune/trial_runner" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/tune/tune" + }, + { + "type": "doc", + "id": "version-1.0.4/reference/tune/utils" + } + ], + "label": "tune", + "type": "category", + "collapsible": true, + "collapsed": true + } + ], + "label": "Reference", + "type": "category", + "collapsible": true, + "collapsed": true + } + ] +} diff --git a/website/versions.json b/website/versions.json new file mode 100644 index 0000000000..b0799b56e2 --- /dev/null +++ b/website/versions.json @@ -0,0 +1,3 @@ +[ + "1.0.4" +]