diff --git a/.github/workflows/benchmark_nightly.yml b/.github/workflows/benchmark_nightly.yml new file mode 100644 index 0000000000..0bb33739f9 --- /dev/null +++ b/.github/workflows/benchmark_nightly.yml @@ -0,0 +1,89 @@ +name: Benchmark torchserve nightly + +on: + # run every day at 2:15am + schedule: + - cron: '15 02 * * *' + +jobs: + nightly: + strategy: + fail-fast: false + matrix: + hardware: [cpu, gpu, inf1, inf2] + runs-on: + - self-hosted + - ${{ matrix.hardware }} + timeout-minutes: 1320 + steps: + - name: Clean up previous run + run: | + echo "Cleaning up previous run" + cd $RUNNER_WORKSPACE + pwd + cd .. + pwd + rm -rf _tool + - name: Setup Python 3.8 + uses: actions/setup-python@v4 + with: + python-version: 3.8 + architecture: x64 + - name: Setup Java 17 + uses: actions/setup-java@v3 + with: + distribution: 'zulu' + java-version: '17' + - name: Checkout TorchServe + uses: actions/checkout@v3 + - name: Install dependencies + run: | + sudo apt-get update -y + sudo apt-get install -y apache2-utils + pip install -r benchmarks/requirements-ab.txt + - name: Benchmark cpu nightly + if: ${{ matrix.hardware == 'cpu' }} + run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_cpu.yaml --skip false + - name: Benchmark gpu nightly + if: ${{ matrix.hardware == 'gpu' }} + run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_gpu.yaml --skip false + - name: Benchmark inf1 nightly + if: ${{ matrix.hardware == 'inf1' }} + env: + NEURON_RT_NUM_CORES: 4 + run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_neuron.yaml --skip false + - name: Benchmark inf2 nightly + if: ${{ matrix.hardware == 'inf2' }} + env: + NEURON_RT_NUM_CORES: 1 + run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_neuronx.yaml --skip false + - name: Save benchmark artifacts + uses: actions/upload-artifact@v2 + with: + name: nightly ${{ matrix.hardware }} artifact + path: /tmp/ts_benchmark + - name: Download benchmark artifacts for auto validation + uses: dawidd6/action-download-artifact@v2 + with: + workflow: ${{ github.event.workflow_run.workflow_id }} + workflow_conclusion: success + if_no_artifact_found: ignore + path: /tmp/ts_artifacts + name: ${{ matrix.hardware }}_benchmark_validation + - name: Validate Benchmark result + run: python benchmarks/validate_report.py --input-artifacts-dir /tmp/ts_artifacts/${{ matrix.hardware }}_benchmark_validation + - name: Update benchmark artifacts for auto validation + run: python benchmarks/utils/update_artifacts.py --output /tmp/ts_artifacts/${{ matrix.hardware }}_benchmark_validation + - name: Upload the updated benchmark artifacts for auto validation + uses: actions/upload-artifact@v2 + with: + name: ${{ matrix.hardware }}_benchmark_validation + path: /tmp/ts_artifacts + - name: Open issue on failure + if: ${{ failure() && github.event_name == 'schedule' && matrix.hardware == 'cpu' }} + uses: dacbd/create-issue-action@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + title: Nightly ${{ matrix.hardware }} benchmark failed + body: Commit ${{ github.sha }} daily scheduled [CI run](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) failed, please check why + assignees: '' diff --git a/.github/workflows/benchmark_nightly_cpu.yml b/.github/workflows/benchmark_nightly_cpu.yml deleted file mode 100644 index 6d91438574..0000000000 --- a/.github/workflows/benchmark_nightly_cpu.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: Benchmark torchserve cpu nightly - -on: - # run 
every day at 2:15am - schedule: - - cron: '15 02 * * *' - -jobs: - nightly: - runs-on: [self-hosted, cpu] - timeout-minutes: 1320 - steps: - - name: Clean up previous run - run: | - echo "Cleaning up previous run" - cd $RUNNER_WORKSPACE - pwd - cd .. - pwd - rm -rf _tool - - name: Setup Python 3.8 - uses: actions/setup-python@v4 - with: - python-version: 3.8 - architecture: x64 - - name: Setup Java 17 - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '17' - - name: Checkout TorchServe - uses: actions/checkout@v3 - - name: Install dependencies - run: | - sudo apt-get update -y - sudo apt-get install -y apache2-utils - pip install -r benchmarks/requirements-ab.txt - export omp_num_threads=1 - - name: Benchmark cpu nightly - run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_cpu.yaml --skip false - - name: Save benchmark artifacts - uses: actions/upload-artifact@v2 - with: - name: nightly cpu artifact - path: /tmp/ts_benchmark - - name: Open issue on failure - if: ${{ failure() && github.event_name == 'schedule' }} - uses: dacbd/create-issue-action@v1 - with: - token: ${{ secrets.GITHUB_TOKEN }} - title: Nightly CPU benchmark failed - body: Commit ${{ github.sha }} daily scheduled [CI run](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) failed, please check why - assignees: '' diff --git a/.github/workflows/benchmark_nightly_gpu.yml b/.github/workflows/benchmark_nightly_gpu.yml deleted file mode 100644 index 3211a87339..0000000000 --- a/.github/workflows/benchmark_nightly_gpu.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: Benchmark torchserve gpu nightly - - -on: - # run every day at 2:15am - schedule: - - cron: '15 02 * * *' - -jobs: - nightly: - runs-on: [self-hosted, gpu] - timeout-minutes: 1320 - steps: - - name: Clean up previous run - run: | - echo "Cleaning up previous run" - cd $RUNNER_WORKSPACE - pwd - cd .. 
- pwd - rm -rf _tool - - name: Setup Python 3.8 - uses: actions/setup-python@v4 - with: - python-version: 3.8 - architecture: x64 - - name: Setup Java 17 - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '17' - - name: Checkout TorchServe - uses: actions/checkout@v3 - - name: Install dependencies - run: | - sudo apt-get update -y - sudo apt-get install -y apache2-utils - pip install -r benchmarks/requirements-ab.txt - - name: Benchmark gpu nightly - run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_gpu.yaml --skip false - - name: Save benchmark artifacts - uses: actions/upload-artifact@v2 - with: - name: nightly gpu artifact - path: /tmp/ts_benchmark diff --git a/.github/workflows/ci_cpu.yml b/.github/workflows/ci_cpu.yml index 39b369ad4a..1f7fd98ad6 100644 --- a/.github/workflows/ci_cpu.yml +++ b/.github/workflows/ci_cpu.yml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-18.04, macOS-latest] + os: [ubuntu-20.04, macOS-latest] steps: - name: Setup Python 3.8 uses: actions/setup-python@v4 diff --git a/.github/workflows/ci_gpu.yml b/.github/workflows/ci_gpu.yml index 17e55f1690..e43e2b92f2 100644 --- a/.github/workflows/ci_gpu.yml +++ b/.github/workflows/ci_gpu.yml @@ -38,7 +38,7 @@ jobs: uses: actions/checkout@v3 - name: Install dependencies run: | - python ts_scripts/install_dependencies.py --environment=dev --cuda=cu102 + python ts_scripts/install_dependencies.py --environment=dev --cuda=cu117 - name: Torchserve Sanity uses: nick-fields/retry@v2 with: diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000000..cbb601699f --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,65 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL" + +on: + # run every day at 11:15am + schedule: + - cron: '15 11 * * *' + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'java', 'python' ] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Setup Python 3.8 + uses: actions/setup-python@v4 + with: + python-version: 3.8 + architecture: x64 + - name: Setup Java 17 + uses: actions/setup-java@v3 + with: + distribution: 'zulu' + java-version: '17' + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + + + - name: Build TorchServe frontend + run: | + python ts_scripts/install_dependencies.py --environment=dev + cd frontend + ./gradlew build -x test + cd .. 
+ + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 + with: + category: "/language:${{matrix.language}}" diff --git a/.github/workflows/docker-ci.yaml b/.github/workflows/docker-ci.yaml new file mode 100644 index 0000000000..a95f0da1f1 --- /dev/null +++ b/.github/workflows/docker-ci.yaml @@ -0,0 +1,42 @@ +name: Docker CI + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + test-build-and-container: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10"] + steps: + - uses: actions/checkout@v3 + + - name: Test build_image.sh script with custom tagging and gpu flag + working-directory: docker + run: ./test_build_image_tagging.sh ${{ matrix.python-version }} + + - name: Build Image for container test + id: image_build + working-directory: docker + run: | + IMAGE_TAG=test-image-${{ matrix.python-version }} + ./build_image.sh -py "${{ matrix.python-version }}" -t "${IMAGE_TAG}" + echo "IMAGE_TAG=${IMAGE_TAG}" >> $GITHUB_OUTPUT + + - name: Container Healthcheck + working-directory: docker + run: ./test_container_health.sh ${{ steps.image_build.outputs.IMAGE_TAG }} + + - name: Check Python version in container + working-directory: docker + run: ./test_container_python_version.sh ${{ steps.image_build.outputs.IMAGE_TAG }} ${{ matrix.python-version }} + + - name: Test model running in container with sample image data + working-directory: docker + run: | + ./test_container_model_prediction.sh ${{ steps.image_build.outputs.IMAGE_TAG }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 6d9b336d6b..a13152b59a 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -77,6 +77,7 @@ jobs: echo "cd serve/" echo "pre-commit install" echo "pre-commit will lint your code for you, so git add and commit those new changes and this check should become green" + echo "If you've already pushed some files remotely, then run git diff --name-only main | xargs pre-commit run --files" spellcheck: runs-on: ubuntu-20.04 diff --git a/.github/workflows/regression_tests_cpu.yml b/.github/workflows/regression_tests_cpu.yml index 58247119ab..33f03eb54e 100644 --- a/.github/workflows/regression_tests_cpu.yml +++ b/.github/workflows/regression_tests_cpu.yml @@ -1,6 +1,16 @@ name: Run Regression Tests on CPU -on: workflow_dispatch +on: + push: + branches: + - master + pull_request: + branches: + - master + +concurrency: + group: ci-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/master' && github.run_number || github.ref }} + cancel-in-progress: true jobs: regression-cpu: diff --git a/.github/workflows/regression_tests_gpu.yml b/.github/workflows/regression_tests_gpu.yml index 6a16fce8f7..ff5a2bc8ea 100644 --- a/.github/workflows/regression_tests_gpu.yml +++ b/.github/workflows/regression_tests_gpu.yml @@ -1,15 +1,21 @@ name: Run Regression Tests on GPU -on: workflow_dispatch +on: + push: + branches: + - master + pull_request: + branches: + - master + +concurrency: + group: ci-gpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/master' && github.run_number || github.ref }} + cancel-in-progress: true jobs: regression-gpu: # creates workflows for CUDA 11.6 & CUDA 11.7 on ubuntu runs-on: [self-hosted, regression-test-gpu] - strategy: - fail-fast: false - matrix: - cuda: ["cu116", "cu117"] steps: - name: Clean up previous run run: | @@ -37,7 +43,7 @@ jobs: uses: actions/checkout@v3 - name: Install dependencies run: | - python ts_scripts/install_dependencies.py 
--environment=dev --cuda=${{ matrix.cuda }} + python ts_scripts/install_dependencies.py --environment=dev --cuda=cu117 - name: Torchserve Regression Tests run: | python test/regression_tests.py diff --git a/.gitignore b/.gitignore index 236cc35cae..6650bdac05 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ dist/ *.egg-info/ .idea *htmlcov* +.cache .coverage .github/actions/ .github/.DS_Store @@ -18,6 +19,7 @@ plugins/*/bin *.backup docs/sphinx/src/ ts_scripts/spellcheck_conf/wordlist.dic +venv/ # Postman files test/artifacts/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a7ee04a103..ec9f575678 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 + rev: v4.4.0 hooks: - id: check-ast - id: check-builtin-literals @@ -18,23 +18,23 @@ repos: - id: check-vcs-permalinks - id: check-shebang-scripts-are-executable - repo: https://github.com/pre-commit/pygrep-hooks - rev: v1.9.0 + rev: v1.10.0 hooks: - id: python-check-mock-methods - id: python-no-log-warn - id: python-use-type-annotations - repo: https://github.com/hadialqattan/pycln - rev: v1.2.5 + rev: v2.1.3 hooks: - id: pycln args: [--all] - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 23.1.0 hooks: - id: black additional_dependencies: ['click==8.0.4'] - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + rev: 5.12.0 hooks: - id: isort args: ["--profile", "black"] diff --git a/README.md b/README.md index b08f9c0323..529c61383a 100644 --- a/README.md +++ b/README.md @@ -94,10 +94,11 @@ To learn more about how to contribute, see the contributor guide [here](https:// ## 📰 News * [Torchserve Performance Tuning, Animated Drawings Case-Study](https://pytorch.org/blog/torchserve-performance-tuning/) -* [Walmart Search: Serving Models at a Scale on TorchServe](https://pytorch.s3.amazonaws.com/posters/ptc2022/D03.pdf) +* [Walmart Search: Serving Models at a Scale on TorchServe](https://medium.com/walmartglobaltech/search-model-serving-using-pytorch-and-torchserve-6caf9d1c5f4d) * [🎥 Scaling inference on CPU with TorchServe](https://www.youtube.com/watch?v=066_Jd6cwZg) * [🎥 TorchServe C++ backend](https://www.youtube.com/watch?v=OSmGGDpaesc) * [Grokking Intel CPU PyTorch performance from first principles: a TorchServe case study](https://pytorch.org/tutorials/intermediate/torchserve_with_ipex.html) +* [Grokking Intel CPU PyTorch performance from first principles (Part 2): a TorchServe case study](https://pytorch.org/tutorials/intermediate/torchserve_with_ipex_2.html) * [Case Study: Amazon Ads Uses PyTorch and AWS Inferentia to Scale Models for Ads Processing](https://pytorch.org/blog/amazon-ads-case-study/) * [Optimize your inference jobs using dynamic batch inference with TorchServe on Amazon SageMaker](https://aws.amazon.com/blogs/machine-learning/optimize-your-inference-jobs-using-dynamic-batch-inference-with-torchserve-on-amazon-sagemaker/) * [Using AI to bring children's drawings to life](https://ai.facebook.com/blog/using-ai-to-bring-childrens-drawings-to-life/) diff --git a/SECURITY.md b/SECURITY.md index f373f0c80a..406d286ee5 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,9 +4,17 @@ | Version | Supported | | ------- | ------------------ | -| 0.5.3 | :white_check_mark: | +| 0.7.1 | :white_check_mark: | + + +## How we do security + +TorchServe relies as much as possible on automated tools for security scanning; in particular, we support +1. 
Dependency Analysis: Using Dependabot +2. Docker Scanning: Using Snyk +3. Code Analysis: Using CodeQL ## Reporting a Vulnerability -If you find a serious vulnerability please report it to opensource@fb.com and torchserve@amazon.com +If you find a serious vulnerability, please report it to opensource@meta.com and torchserve@amazon.com diff --git a/benchmarks/README.md b/benchmarks/README.md index b935844de6..d17875418f 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -76,6 +76,7 @@ The following parameters can be used to run the AB benchmark suite. - ts: Use Already running Torchserve instance. Default: False - gpus: Number of gpus to run docker container with. By default it runs the docker container on CPU. - backend_profiling: Enable backend profiling using CProfile. Default: False +- generate_graphs: Enable generation of graph plots. Default: False - config_properties: Path to config.properties file. Default: config.properties in the benchmark directory - inference_model_url: Inference function url - can be either for predictions or explanations. Default: predictions/benchmark. - config: All the above params can be set using a config JSON file. When this flag is used, all other cmd line params are ignored. @@ -127,7 +128,7 @@ python benchmark-ab.py --url https://torchserve.pytorch.org/mar_files/mnist.mar * TORCHSERVE SERVING PREDICTIONS WITH DOCKER ``` -python benchmark-ab.py --url https://torchserve.pytorch.org/mar_files/mnist.mar --content_type application/png --config_properties config.properties --inference_model_url predictions/benchmark --input ../examples/image_classifier/mnist/test_data/0.png --exec_env docker +python benchmark-ab.py --url https://torchserve.pytorch.org/mar_files/mnist.mar --content_type application/png --config_properties config.properties --inference_model_url predictions/benchmark --input ../examples/image_classifier/mnist/test_data/0.png --exec_env docker ``` ### Test plans Benchmark supports pre-defined, pre-configured params that can be selected based 2. vgg11_1000r_10c: vgg11 model with requests =1000 and concurrency=10 3. vgg11_10000r_100c: vgg11 model with requests =10000 and concurrency=100 4. resnet152_batch: Resnet-152 model with batch size = 4, requests =1000 and concurrency=10 -5. resnet152_batch_docker: Resnet-152 model with batch size = 4, requests =1000, concurrency=10 and execution env = docker +5. resnet152_batch_docker: Resnet-152 model with batch size = 4, requests =1000, concurrency=10 and execution env = docker Note: These pre-defined parameters in test plan can be overwritten by cmd line args. @@ -209,11 +210,11 @@ python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_template ``` ## Github Actions benchmarking -If you need to run your benchmarks on a specific cloud or hardware infrastructure. We highly recommend you fork this repo and leverage the benchmarks in `.github/workflows/benchmark-nightly_cpu*.yml` which will run the benchmarks on a custom instance of your choice and save the results as a github artifact. To learn more about how to create your own custom runner by following instructions from Github here https://docs.github.com/en/actions/hosting-your-own-runners/adding-self-hosted-runners +If you need to run your benchmarks on a specific cloud or hardware infrastructure, 
we highly recommend you fork this repo and leverage the benchmarks in `.github/workflows/benchmark_nightly.yml`, which will run the benchmarks on a custom instance of your choice and save the results as a github artifact. To learn more about how to create your own custom runner, follow the instructions from Github here: https://docs.github.com/en/actions/hosting-your-own-runners/adding-self-hosted-runners The high level approach 1. Create a cloud instance in your favorite cloud provider 2. Configure it so it can talk to github actions by running some shell commands listed here https://docs.github.com/en/actions/hosting-your-own-runners/adding-self-hosted-runners 3. Tag your instances in the runners tab on Github 3. In the `.yml` make sure to use `runs-on [self-hosted, your_tag]` 4. Inspect the results in https://github.com/pytorch/serve/actions and download the artifacts for further analysis \ No newline at end of file +4. Inspect the results in https://github.com/pytorch/serve/actions and download the artifacts for further analysis diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index 7918cfdd0f..d7bf07f062 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -17,9 +17,10 @@ class BenchmarkConfig: - def __init__(self, yaml_dict, skip_ts_install): + def __init__(self, yaml_dict, skip_ts_install, skip_upload): self.yaml_dict = yaml_dict self.skip_ts_install = skip_ts_install + self.skip_upload = skip_upload self.bm_config = {} yesterday = datetime.date.today() - datetime.timedelta(days=1) self.bm_config["version"] = "torchserve-nightly=={}.{}.{}".format( @@ -89,15 +90,15 @@ def load_config(self): self.models(v) elif k == "hardware": self.hardware(v) - elif k == "metrics_cmd": + elif k == "metrics_cmd" and not self.skip_upload: self.metrics_cmd(v) - elif k == "report_cmd": + elif k == "report_cmd" and not self.skip_upload: report_cmd = v self.bm_config["model_config_path"] = ( - "{}/cpu".format(MODEL_JSON_CONFIG_PATH) - if self.bm_config["hardware"] == "cpu" - else "{}/gpu".format(MODEL_JSON_CONFIG_PATH) + "{}/{}".format(MODEL_JSON_CONFIG_PATH, self.bm_config["hardware"]) + if self.bm_config["hardware"] in ["cpu", "gpu", "neuron", "neuronx"] + else "{}/cpu".format(MODEL_JSON_CONFIG_PATH) ) if self.skip_ts_install: @@ -110,12 +111,12 @@ def load_config(self): print("{}={}".format(k, v)) -def load_benchmark_config(bm_config_path, skip_ts_install): +def load_benchmark_config(bm_config_path, skip_ts_install, skip_upload): yaml = ruamel.yaml.YAML() with open(bm_config_path, "r") as f: yaml_dict = yaml.load(f) - benchmark_config = BenchmarkConfig(yaml_dict, skip_ts_install) + benchmark_config = BenchmarkConfig(yaml_dict, skip_ts_install, skip_upload) benchmark_config.load_config() return benchmark_config.bm_config @@ -125,6 +126,7 @@ def benchmark_env_setup(bm_config, skip_ts_install): install_torchserve(skip_ts_install, bm_config["hardware"], bm_config["version"]) setup_benchmark_path(bm_config["model_config_path"]) build_model_json_config(bm_config["models"]) + enable_launcher_with_logical_core(bm_config["hardware"]) def install_torchserve(skip_ts_install, hw, ts_version): @@ -178,6 +180,13 @@ def build_model_json_config(models): gen_model_config_json.convert_yaml_to_json(input_file, MODEL_JSON_CONFIG_PATH) +def enable_launcher_with_logical_core(hw): + if hw == "cpu": + with open("./benchmarks/config.properties", "a") as f: + f.write("cpu_launcher_enable=true\n") + f.write("cpu_launcher_args=--use_logical_core\n") + + def 
run_benchmark(bm_config): files = os.listdir(bm_config["model_config_path"]) files.sort() @@ -277,6 +286,10 @@ def main(): action="store", help="true: skip torchserve installation. default: true", ) + parser.add_argument( + "--skip_upload", + help="true: skip uploading commands. default: false", + ) arguments = parser.parse_args() skip_ts_config = ( @@ -284,7 +297,12 @@ def main(): if arguments.skip is not None and arguments.skip.lower() == "false" else True ) - bm_config = load_benchmark_config(arguments.input, skip_ts_config) + skip_upload = ( + True + if arguments.skip_upload is not None and arguments.skip_upload.lower() == "true" + else False + ) + bm_config = load_benchmark_config(arguments.input, skip_ts_config, skip_upload) benchmark_env_setup(bm_config, skip_ts_config) run_benchmark(bm_config) clean_up_benchmark_env(bm_config) diff --git a/benchmarks/benchmark-ab.py b/benchmarks/benchmark-ab.py index 896282c41c..097ab1e985 100644 --- a/benchmarks/benchmark-ab.py +++ b/benchmarks/benchmark-ab.py @@ -30,6 +30,7 @@ "image": "", "docker_runtime": "", "backend_profiling": False, + "generate_graphs": False, "config_properties": "config.properties", "inference_model_url": "predictions/benchmark", "report_location": tempfile.gettempdir(), @@ -94,6 +95,12 @@ def json_provider(file_path, cmd_name): default=False, help="Enable backend profiling using CProfile. Default False", ) +@click.option( + "--generate_graphs", + "-gg", + default=False, + help="Enable generation of graph plots. Default False", +) @click.option( + "--config_properties", + "-cp", @@ -140,6 +147,7 @@ def benchmark( inference_model_url, report_location, tmp_dir, + generate_graphs, ): input_params = { "url": url, @@ -159,6 +167,7 @@ "inference_model_url": inference_model_url, "report_location": report_location, "tmp_dir": tmp_dir, + "generate_graphs": generate_graphs, } # set ab params @@ -441,8 +450,9 @@ def generate_report(warm_up_lines): click.secho("\n\nGenerating Reports...", fg="green") extract_metrics(warm_up_lines=warm_up_lines) generate_csv_output() - generate_latency_graph() - generate_profile_graph() + if execution_params["generate_graphs"]: + generate_latency_graph() + generate_profile_graph() click.secho("\nTest suite execution complete.", fg="green") diff --git a/benchmarks/benchmark_config_cpu.yaml b/benchmarks/benchmark_config_cpu.yaml index 42d14a3dde..008d784204 100644 --- a/benchmarks/benchmark_config_cpu.yaml +++ b/benchmarks/benchmark_config_cpu.yaml @@ -13,7 +13,7 @@ models: - "fastrcnn.yaml" - "mnist.yaml" - "vgg16.yaml" - - "wf_dog_breed.yaml" +# - "wf_dog_breed.yaml" # benchmark on "cpu" or "gpu". # "cpu" is set if "hardware" is not specified diff --git a/benchmarks/benchmark_config_gpu.yaml b/benchmarks/benchmark_config_gpu.yaml index ccbf9f86ce..7e8969945e 100644 --- a/benchmarks/benchmark_config_gpu.yaml +++ b/benchmarks/benchmark_config_gpu.yaml @@ -15,7 +15,7 @@ models: - "fastrcnn.yaml" - "mnist.yaml" - "vgg16.yaml" - - "wf_dog_breed.yaml" +# - "wf_dog_breed.yaml" # benchmark on "cpu" or "gpu". # "cpu" is set if "hardware" is not specified diff --git a/benchmarks/benchmark_config_neuron.yaml b/benchmarks/benchmark_config_neuron.yaml new file mode 100644 index 0000000000..38fb76c78d --- /dev/null +++ b/benchmarks/benchmark_config_neuron.yaml @@ -0,0 +1,45 @@ +# Torchserve version to be installed. 
It can be one of the following options +# - branch : "master" +# - nightly: "2022.3.16" +# - release: "0.5.3" +# Nightly build will be installed if "ts_version" is not specified +#ts_version: +# branch: &ts_version "master" + +# a list of model configure yaml files defined in benchmarks/models_config +# or a list of model configure yaml files with full path +models: + - "bert_neuron.yaml" + +# benchmark on "cpu", "gpu" or "neuron". +# "cpu" is set if "hardware" is not specified +hardware: &hardware "neuron" + +# load prometheus metrics report to remote storage or a different local path if "metrics_cmd" is set. +# the command line to load prometheus metrics report to remote system. +# Here is an example of an AWS CloudWatch command: +# Note: +# - keep the values in the same order as the command definition. +# - set up the command before enabling `metrics_cmd`. +# For example, aws client and AWS credentials need to be set up before trying this example. metrics_cmd: + - "cmd": "aws cloudwatch put-metric-data" + - "--namespace": ["torchserve_benchmark_nightly_", *hardware] + - "--region": "us-east-2" + - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json' + +# load report to remote storage or a different local path if "report_cmd" is set. +# the command line to load report to remote storage. +# Here is an example of an AWS S3 command: +# Note: +# - keep the values in the same order as the command. +# - set up the command before enabling `report_cmd`. +# For example, aws client, AWS credentials and S3 bucket +# need to be set up before trying this example. +# - "today()" is a keyword to apply the current date in the path +# For example, the dest path in the following example is +# s3://torchserve-model-serving/benchmark/2022-03-18/gpu +report_cmd: + - "cmd": "aws s3 cp --recursive" + - "source": '/tmp/ts_benchmark/' + - "dest": ['s3://torchserve-benchmark/nightly', "today()", *hardware] diff --git a/benchmarks/benchmark_config_neuronx.yaml b/benchmarks/benchmark_config_neuronx.yaml new file mode 100644 index 0000000000..b8cb3ecf68 --- /dev/null +++ b/benchmarks/benchmark_config_neuronx.yaml @@ -0,0 +1,45 @@ +# Torchserve version to be installed. It can be one of the following options +# - branch : "master" +# - nightly: "2022.3.16" +# - release: "0.5.3" +# Nightly build will be installed if "ts_version" is not specified +#ts_version: +# branch: &ts_version "master" + +# a list of model configure yaml files defined in benchmarks/models_config +# or a list of model configure yaml files with full path +models: + - "bert_neuronx.yaml" + +# benchmark on "cpu", "gpu", "neuron" or "neuronx". +# "cpu" is set if "hardware" is not specified +hardware: &hardware "neuronx" + +# load prometheus metrics report to remote storage or a different local path if "metrics_cmd" is set. +# the command line to load prometheus metrics report to remote system. +# Here is an example of an AWS CloudWatch command: +# Note: +# - keep the values in the same order as the command definition. +# - set up the command before enabling `metrics_cmd`. +# For example, aws client and AWS credentials need to be set up before trying this example. metrics_cmd: + - "cmd": "aws cloudwatch put-metric-data" + - "--namespace": ["torchserve_benchmark_nightly_", *hardware] + - "--region": "us-east-2" + - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json' + +# load report to remote storage or a different local path if "report_cmd" is set. +# the command line to load report to remote storage. 
+# Here is an example of an AWS S3 command: +# Note: +# - keep the values in the same order as the command. +# - set up the command before enabling `report_cmd`. +# For example, aws client, AWS credentials and S3 bucket +# need to be set up before trying this example. +# - "today()" is a keyword to apply the current date in the path +# For example, the dest path in the following example is +# s3://torchserve-model-serving/benchmark/2022-03-18/gpu +report_cmd: + - "cmd": "aws s3 cp --recursive" + - "source": '/tmp/ts_benchmark/' + - "dest": ['s3://torchserve-benchmark/nightly', "today()", *hardware] diff --git a/benchmarks/models_config/bert_neuron.yaml b/benchmarks/models_config/bert_neuron.yaml index 13d9004a22..be771fb8df 100644 --- a/benchmarks/models_config/bert_neuron.yaml +++ b/benchmarks/models_config/bert_neuron.yaml @@ -1,22 +1,68 @@ --- -bert_inf1: +bert_neuron_batch_1: scripted_mode: benchmark_engine: "ab" - compile_per_batch_size: True + url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuron_batch_1.mar workers: - 4 batch_delay: 100 batch_size: - 1 + input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt" + requests: 10000 + concurrency: 100 + backend_profiling: False + exec_env: "local" + processors: + - "neuron" + +bert_neuron_batch_2: + scripted_mode: + benchmark_engine: "ab" + url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuron_batch_2.mar + workers: + - 4 + batch_delay: 100 + batch_size: - 2 + input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt" + requests: 10000 + concurrency: 100 + backend_profiling: False + exec_env: "local" + processors: + - "neuron" + +bert_neuron_batch_4: + scripted_mode: + benchmark_engine: "ab" + url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuron_batch_4.mar + workers: + - 4 + batch_delay: 100 + batch_size: - 4 + input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt" + requests: 10000 + concurrency: 100 + backend_profiling: False + exec_env: "local" + processors: + - "neuron" + +bert_neuron_batch_8: + scripted_mode: + benchmark_engine: "ab" + url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuron_batch_8.mar + workers: + - 4 + batch_delay: 100 + batch_size: - 8 - input: "./benchmarks/automated/tests/resources/neuron-bert/input" + input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt" requests: 10000 concurrency: 100 backend_profiling: False - exec_env: "aws_neuron_pytorch_p36" + exec_env: "local" processors: - - "inferentia" -instance_types: - - "inf1.6xlarge" \ No newline at end of file + - "neuron" diff --git a/benchmarks/models_config/bert_neuronx.yaml b/benchmarks/models_config/bert_neuronx.yaml new file mode 100644 index 0000000000..b7e4ba46f8 --- /dev/null +++ b/benchmarks/models_config/bert_neuronx.yaml @@ -0,0 +1,68 @@ +--- +bert_neuronx_batch_1: + scripted_mode: + benchmark_engine: "ab" + url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_1.mar + workers: + - 2 + batch_delay: 100 + batch_size: + - 1 + input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt" + requests: 10000 + concurrency: 100 + backend_profiling: False + exec_env: "local" + processors: + - "neuronx" + +bert_neuronx_batch_2: + scripted_mode: + benchmark_engine: "ab" + url: 
https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_2.mar + workers: + - 2 + batch_delay: 100 + batch_size: + - 2 + input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt" + requests: 10000 + concurrency: 100 + backend_profiling: False + exec_env: "local" + processors: + - "neuronx" + +bert_neuronx_batch_4: + scripted_mode: + benchmark_engine: "ab" + url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_4.mar + workers: + - 2 + batch_delay: 100 + batch_size: + - 4 + input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt" + requests: 10000 + concurrency: 100 + backend_profiling: False + exec_env: "local" + processors: + - "neuronx" + +bert_neuronx_batch_8: + scripted_mode: + benchmark_engine: "ab" + url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_8.mar + workers: + - 2 + batch_delay: 100 + batch_size: + - 8 + input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt" + requests: 10000 + concurrency: 100 + backend_profiling: False + exec_env: "local" + processors: + - "neuronx" diff --git a/benchmarks/requirements-ab.txt b/benchmarks/requirements-ab.txt index 6275e7b7d8..ca689c38fe 100644 --- a/benchmarks/requirements-ab.txt +++ b/benchmarks/requirements-ab.txt @@ -4,5 +4,5 @@ click-config-file matplotlib requests pyyaml -mdutils -ruamel.yaml \ No newline at end of file +mdutils==1.4.0 +ruamel.yaml diff --git a/benchmarks/utils/report.py b/benchmarks/utils/report.py new file mode 100644 index 0000000000..e5980c2eaa --- /dev/null +++ b/benchmarks/utils/report.py @@ -0,0 +1,75 @@ +import csv + +METRICS_VALIDATED = [ + "TS throughput", + "TS latency P50", + "TS latency P90", + "TS latency P99", + "Model_p50", + "Model_p90", + "Model_p99", + "memory_percentage_mean", + "gpu_memory_used_mean", + "cpu_percentage_mean", + "gpu_percentage_mean", +] + + +# Acceptable metric deviation needs more complicated logic. +# Example: For latencies in 2 digits, 50% might be acceptable +# For 3 digit latencies, 20-30% might be the right value +# For cpu_memory < 15%, 50% deviation works but for CPU > 40%, 10-15% +# might be the right value +ACCEPTABLE_METRIC_DEVIATION = 0.3 + + +class Report: + def __init__(self, deviation=0, num_reports=0): + self.properties = {} + self.mode = None + self.throughput = 0 + self.batch_size = 0 + self.workers = 0 + self.deviation = deviation + self.num_reports = num_reports + + def _get_mode(self, csv_file): + cfg = csv_file.split("/")[-2] + cfg = cfg.split("_") + mode = cfg[0] + "_" + cfg[1] + self.mode = mode + + def read_csv(self, csv_file): + with open(csv_file, newline="") as f: + reader = csv.DictReader(f) + for k, v in next(reader).items(): + if k in METRICS_VALIDATED: + self.properties[k] = float(v) + self._get_mode(csv_file) + + def update(self, report): + for property in self.properties: + # sum the properties to find the mean later + self.properties[property] += report.properties[property] + + def mean(self): + for k, v in self.properties.items(): + self.properties[k] = v / self.num_reports + + +def metric_valid(key, obs_val, exp_val, threshold): + # In case of throughput, higher is better + # In case of memory, lower is better. 
+ # We ignore lower values for memory-related metrics + lower = False + if "throughput" not in key: + lower = True + return check_if_within_threshold(exp_val, obs_val, threshold) or ( + (obs_val < exp_val and lower) or (obs_val > exp_val and not lower) + ) + + +def check_if_within_threshold(value1, value2, threshold): + if float(value1) == 0.0: + return True + return abs((value1 - value2) / float(value1)) <= threshold diff --git a/benchmarks/utils/update_artifacts.py b/benchmarks/utils/update_artifacts.py new file mode 100644 index 0000000000..6b2dcc4f39 --- /dev/null +++ b/benchmarks/utils/update_artifacts.py @@ -0,0 +1,125 @@ +import argparse +import os +import shutil + +BENCHMARK_REPORT_PATH = "/tmp/ts_benchmark" +BENCHMARK_ARTIFACTS_PATH = "/tmp/ts_artifacts" +BENCHMARK_REPORT_FILE = "ab_report.csv" +WINDOW_LEN = 30 +WINDOW_START = 0 + +################################################################ +# This is an example directory structure for the artifacts. +# Here, report_id 1 is missing; the new report would be added under 1 +# and we would remove report_id 2. +# . +# └── tmp/ +# └── ts_artifacts/ +# ├── 0/ +# │ ├── eager_mode_mnist_w4_b1/ +# │ │ └── ab_report.csv +# │ ├── eager_mode_mnist_w4_b2/ +# │ │ └── ab_report.csv +# │ └── ... +# ├── 2/ +# │ ├── eager_mode_mnist_w4_b1/ +# │ │ └── ab_report.csv +# │ ├── eager_mode_mnist_w4_b2/ +# │ │ └── ab_report.csv +# │ └── ... +# ├── 3/ +# │ ├── eager_mode_mnist_w4_b1/ +# │ │ └── ab_report.csv +# │ ├── eager_mode_mnist_w4_b2/ +# │ │ └── ab_report.csv +# │ └── ... +# ├── ... +# └── 6/ +# ├── eager_mode_mnist_w4_b1/ +# │ └── ab_report.csv +# ├── eager_mode_mnist_w4_b2/ +# │ └── ab_report.csv +# └── ... +################################################################ + + +# Copy BENCHMARK_REPORT_FILE to artifacts +def copy_benchmark_reports(input, output): + for dir in os.listdir(input): + if os.path.isdir(os.path.join(input, dir)): + new_dir = os.path.join(output, dir) + os.makedirs(new_dir, exist_ok=True) + shutil.copy(os.path.join(input, dir, BENCHMARK_REPORT_FILE), new_dir) + + +# Save new report and delete the oldest report +def update_new_report(input_dir, output_dir, add_report_id, del_report_id): + # Add new report + new_dir = os.path.join(output_dir, str(add_report_id)) + print("Creating artifacts ", new_dir) + copy_benchmark_reports(input_dir, new_dir) + + # Remove old report + if isinstance(del_report_id, int): + rm_dir = os.path.join(output_dir, str(del_report_id % WINDOW_LEN)) + print("Removing artifacts ", rm_dir) + shutil.rmtree(rm_dir, ignore_errors=True) + + +# Create artifacts for a rolling window of WINDOW_LEN-1 reports +def update_artifacts(input_dir, output_dir): + # Create a directory where artifacts will be stored + os.makedirs(output_dir, exist_ok=True) + + # Get the sorted list of existing report_ids + list_dirs = sorted(map(lambda x: int(x), os.listdir(output_dir))) + num_reports = len(list_dirs) + + # Initial case: when there are fewer than WINDOW_LEN-1 reports + if num_reports < WINDOW_LEN - 1: + add_report_id, del_report_id = num_reports, None + update_new_report(input_dir, output_dir, add_report_id, del_report_id) + return + + # When there are WINDOW_LEN - 1 reports and we want to add the new report + # and remove the oldest report + for i, report_id in enumerate(list_dirs): + if i != report_id or (i + 1 == WINDOW_LEN - 1): + if i != report_id: + # When report_id has a missing element in sequence + add_report_id, del_report_id = i, report_id + else: + # When report_id WINDOW_LEN-1 is missing + add_report_id, 
del_report_id = i + 1, (i + 2) % WINDOW_LEN + update_new_report(input_dir, output_dir, add_report_id, del_report_id) + break + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--input_dir", + nargs="?", + help="the dir of a list of model benchmark result subdirs", + const=BENCHMARK_REPORT_PATH, + type=str, + default=BENCHMARK_REPORT_PATH, + ) + + parser.add_argument( + "--output_dir", + nargs="?", + help="the dir of model benchmark artifacts", + const=BENCHMARK_ARTIFACTS_PATH, + type=str, + default=BENCHMARK_ARTIFACTS_PATH, + ) + + args = parser.parse_args() + + update_artifacts(args.input_dir, args.output_dir) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/validate_report.py b/benchmarks/validate_report.py new file mode 100644 index 0000000000..968ed456ae --- /dev/null +++ b/benchmarks/validate_report.py @@ -0,0 +1,95 @@ +import argparse +import os + +from utils.report import ( + ACCEPTABLE_METRIC_DEVIATION, + METRICS_VALIDATED, + Report, + metric_valid, +) +from utils.update_artifacts import ( + BENCHMARK_ARTIFACTS_PATH, + BENCHMARK_REPORT_FILE, + BENCHMARK_REPORT_PATH, +) + + +def validate_reports(artifacts_dir, report_dir, deviation): + # Read baseline reports + baseline_reports = {} + num_reports = len(os.listdir(artifacts_dir)) + for _d in sorted(os.listdir(artifacts_dir)): + dir = os.path.join(artifacts_dir, _d) + for subdir in sorted(os.listdir(dir)): + csv_file = os.path.join(dir, subdir, BENCHMARK_REPORT_FILE) + + report = Report(deviation, num_reports) + report.read_csv(csv_file) + if subdir not in baseline_reports: + baseline_reports[subdir] = report + else: + baseline_reports[subdir].update(report) + + # Get the mean value of each of the properties for every report + for model, report in baseline_reports.items(): + report.mean() + baseline_reports[model] = report + + # Read generated reports + generated_reports = {} + for subdir in sorted(os.listdir(report_dir)): + if os.path.isdir(os.path.join(report_dir, subdir)): + csv_file = os.path.join(report_dir, subdir, BENCHMARK_REPORT_FILE) + report = Report() + report.read_csv(csv_file) + generated_reports[subdir] = report + + # Compare generated reports with baseline reports + error = False + for model, report in generated_reports.items(): + for key in METRICS_VALIDATED: + if not metric_valid( + key, + report.properties[key], + baseline_reports[model].properties[key], + baseline_reports[model].deviation, + ): + print( + f"Error while validating {key} for model: {model}, " + f"Expected value: {baseline_reports[model].properties[key]:.2f}, " + f"Observed value: {report.properties[key]:.2f}" + ) + error = True + if not error: + print(f"Model {model} successfully validated") + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--input-artifacts-dir", + help="directory where benchmark artifacts have been saved", + type=str, + default=BENCHMARK_ARTIFACTS_PATH, + ) + + parser.add_argument( + "--input-report-dir", + help="directory where current benchmark report is saved", + type=str, + default=BENCHMARK_REPORT_PATH, + ) + + parser.add_argument( + "--deviation", + help="acceptable variation in metrics values", + type=float, + default=ACCEPTABLE_METRIC_DEVIATION, + ) + args = parser.parse_args() + validate_reports(args.input_artifacts_dir, args.input_report_dir, args.deviation) + + +if __name__ == "__main__": + main() diff --git a/docker/Dockerfile b/docker/Dockerfile index 498def3576..333f585047 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -18,35 
+18,42 @@ ARG BASE_IMAGE=ubuntu:rolling +# Note: +# Define here the default python version to be used in all later build-stages. +# ARG and ENV variables do not persist across stages (they're build-stage scoped). +# That is crucial for ARG PYTHON_VERSION, which otherwise becomes "", leading to nasty bugs +# that don't let the build fail but break the current version handling logic and result +# in images with the wrong python version. To fix that, we will restate the ARG PYTHON_VERSION +# on each build-stage. +ARG PYTHON_VERSION=3.9 + FROM ${BASE_IMAGE} AS compile-image ARG BASE_IMAGE=ubuntu:rolling +ARG PYTHON_VERSION ENV PYTHONUNBUFFERED TRUE RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ - #apt --fix-broken -y install && \ apt-get update && \ + apt-get upgrade -y && \ + apt-get install software-properties-common -y && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ apt remove python-pip python3-pip && \ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ - ca-certificates \ - g++ \ - python3.8 \ - python3.8-dev \ - python3.8-distutils \ - python3.8-venv \ - python3-venv \ - openjdk-17-jdk \ - curl \ - && rm -rf /var/lib/apt/lists/* \ - && cd /tmp \ - && curl -O https://bootstrap.pypa.io/get-pip.py \ - && python3.8 get-pip.py - - -RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 \ - && update-alternatives --install /usr/local/bin/pip pip /usr/local/bin/pip3.8 1 - -RUN python3.8 -m venv /home/venv - + ca-certificates \ + g++ \ + python3-distutils \ + python$PYTHON_VERSION \ + python$PYTHON_VERSION-dev \ + python$PYTHON_VERSION-venv \ + openjdk-17-jdk \ + curl \ + git \ + && rm -rf /var/lib/apt/lists/* + +# Make the virtual environment and "activate" it by adding it first to the path. 
+# From here on, the python$PYTHON_VERSION interpreter is used and the packages +# are installed in /home/venv, which is what we need for the "runtime-image" +RUN python$PYTHON_VERSION -m venv /home/venv ENV PATH="/home/venv/bin:$PATH" RUN python -m pip install -U pip setuptools @@ -56,34 +63,44 @@ RUN export USE_CUDA=1 ARG CUDA_VERSION="" -RUN TORCH_VER=$(curl --silent --location https://pypi.org/pypi/torch/json | python -c "import sys, json, pkg_resources; releases = json.load(sys.stdin)['releases']; print(sorted(releases, key=pkg_resources.parse_version)[-1])") && \ - TORCH_VISION_VER=$(curl --silent --location https://pypi.org/pypi/torchvision/json | python -c "import sys, json, pkg_resources; releases = json.load(sys.stdin)['releases']; print(sorted(releases, key=pkg_resources.parse_version)[-1])") && \ +RUN git clone --depth 1 https://github.com/pytorch/serve.git + +WORKDIR "serve" + +RUN \ if echo "$BASE_IMAGE" | grep -q "cuda:"; then \ # Install CUDA version specific binary when CUDA version is specified as a build arg if [ "$CUDA_VERSION" ]; then \ - python -m pip install --no-cache-dir torch==$TORCH_VER+$CUDA_VERSION torchvision==$TORCH_VISION_VER+$CUDA_VERSION -f https://download.pytorch.org/whl/torch_stable.html; \ - # Install the binary with the latest CUDA version support + python ./ts_scripts/install_dependencies.py --cuda $CUDA_VERSION; \ + # Install the binary with the latest CPU image on a CUDA base image else \ - python -m pip install --no-cache-dir torch torchvision; \ + python ./ts_scripts/install_dependencies.py; \ fi; \ - python -m pip install --no-cache-dir -r https://raw.githubusercontent.com/pytorch/serve/master/requirements/common.txt; \ # Install the CPU binary else \ - python -m pip install --no-cache-dir torch==$TORCH_VER+cpu torchvision==$TORCH_VISION_VER+cpu -f https://download.pytorch.org/whl/torch_stable.html; \ + python ./ts_scripts/install_dependencies.py; \ fi -RUN python -m pip install -U setuptools && python -m pip install --no-cache-dir captum torchtext torchserve torch-model-archiver + +# Make sure the latest version of torchserve is uploaded before running this +RUN python -m pip install --no-cache-dir torchserve torch-model-archiver torch-workflow-archiver # Final image for production FROM ${BASE_IMAGE} AS runtime-image - +# Re-state ARG PYTHON_VERSION to make it active in this build-stage (uses the default defined at the top) +ARG PYTHON_VERSION ENV PYTHONUNBUFFERED TRUE RUN --mount=type=cache,target=/var/cache/apt \ apt-get update && \ + apt-get upgrade -y && \ + apt-get install software-properties-common -y && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt remove python-pip python3-pip && \ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ - python3.8 \ - python3.8-distutils \ - python3.8-dev \ + python$PYTHON_VERSION \ + python3-distutils \ + python$PYTHON_VERSION-dev \ + python$PYTHON_VERSION-venv \ # using openjdk-17-jdk due to circular dependency(ca-certificates) bug in openjdk-17-jre-headless debian package # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1009905 openjdk-17-jdk \ diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev index 0ffcbfa3ca..3d26311f36 100644 --- a/docker/Dockerfile.dev +++ b/docker/Dockerfile.dev @@ -1,13 +1,13 @@ # syntax = docker/dockerfile:experimental # # Following comments have been shamelessly copied from https://github.com/pytorch/pytorch/blob/master/Dockerfile -# +# # NOTE: To build this you will need a docker version > 18.06 with # experimental enabled and 
DOCKER_BUILDKIT=1 # # If you do not use buildkit you are not going to have a good time # -# For reference: +# For reference: # https://docs.docker.com/develop/develop-images/build_enhancements/ ARG BASE_IMAGE=ubuntu:rolling @@ -22,11 +22,16 @@ ARG CUDA_VERSION ARG BUILD_WITH_IPEX ARG IPEX_VERSION=1.11.0 ARG IPEX_URL=https://software.intel.com/ipex-whl-stable +ARG PYTHON_VERSION=3.9 + ENV PYTHONUNBUFFERED TRUE RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ apt-get update && \ + apt-get upgrade -y && \ + apt-get install software-properties-common -y && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ fakeroot \ ca-certificates \ @@ -34,38 +39,45 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ sudo \ g++ \ git \ - python3.8 \ - python3.8-dev \ - python3.8-venv \ - python3.8-distutils \ + python$PYTHON_VERSION \ + python$PYTHON_VERSION-dev \ + python$PYTHON_VERSION-venv \ + python$PYTHON_VERSION-distutils \ # python3-venv \ build-essential \ openjdk-17-jdk \ curl \ vim \ - numactl \ + numactl \ + openssh-client \ && if [ "$BUILD_WITH_IPEX" = "true" ]; then apt-get update && apt-get install -y libjemalloc-dev libgoogle-perftools-dev libomp-dev && ln -s /usr/lib/x86_64-linux-gnu/libjemalloc.so /usr/lib/libjemalloc.so && ln -s /usr/lib/x86_64-linux-gnu/libtcmalloc.so /usr/lib/libtcmalloc.so && ln -s /usr/lib/x86_64-linux-gnu/libiomp5.so /usr/lib/libiomp5.so; fi \ && rm -rf /var/lib/apt/lists/* \ && cd /tmp \ && curl -O https://bootstrap.pypa.io/get-pip.py \ - && python3.8 get-pip.py + && python$PYTHON_VERSION get-pip.py + +RUN mkdir -p /root/.ssh && \ + chmod 0700 /root/.ssh && \ + ssh-keyscan github.com > /root/.ssh/known_hosts -RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 \ - && update-alternatives --install /usr/local/bin/pip pip /usr/local/bin/pip3.8 1 +RUN update-alternatives --install /usr/bin/python python /usr/bin/python$PYTHON_VERSION 1 \ + && update-alternatives --install /usr/local/bin/pip pip /usr/local/bin/pip3 1 # Build Dev Image FROM compile-image AS dev-image ARG MACHINE_TYPE=cpu ARG CUDA_VERSION -RUN if [ "$MACHINE_TYPE" = "gpu" ]; then export USE_CUDA=1; fi \ - && git clone https://github.com/pytorch/serve.git \ - && cd serve \ +RUN --mount=type=ssh if [ "$MACHINE_TYPE" = "gpu" ]; then export USE_CUDA=1; fi \ + && git clone git@github.com:textshuttle/pytorch-serve.git \ + && cd pytorch-serve \ && git checkout ${BRANCH_NAME} \ - && python3.8 -m venv /home/venv \ - && python -m pip install -U pip setuptools \ - && if [ -z "$CUDA_VERSION" ]; then python ts_scripts/install_dependencies.py --environment=dev; else python ts_scripts/install_dependencies.py --environment=dev --cuda $CUDA_VERSION; fi \ + && python$PYTHON_VERSION -m venv /home/venv +ENV PATH="/home/venv/bin:$PATH" +WORKDIR pytorch-serve +RUN python -m pip install -U pip setuptools \ + && if [ -z "$CUDA_VERSION" ]; then python ts_scripts/install_dependencies.py --environment=dev; else python ts_scripts/install_dependencies.py --environment=dev --cuda $CUDA_VERSION; fi \ && if [ "$BUILD_WITH_IPEX" = "true" ]; then python -m pip install --no-cache-dir intel_extension_for_pytorch==${IPEX_VERSION} -f ${IPEX_URL}; fi \ - && python ts_scripts/install_from_src.py \ + && python ts_scripts/install_from_src.py --git-branch $BRANCH_NAME \ && useradd -m model-server \ && mkdir -p /home/model-server/tmp \ && cp docker/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh \ diff --git 
a/docker/README.md b/docker/README.md index 35ffc18b43..fc65749532 100644 --- a/docker/README.md +++ b/docker/README.md @@ -36,9 +36,10 @@ Use `build_image.sh` script to build the docker images. The script builds the `p |-g, --gpu|Build image with GPU based ubuntu base image| |-bt, --buildtype|Which type of docker image to build. Can be one of : production, dev, codebuild| |-t, --tag|Tag name for image. If not specified, script uses torchserve default tag names.| -|-cv, --cudaversion| Specify to cuda version to use. Supported values `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`. Default `cu117`| +|-cv, --cudaversion| Specify the cuda version to use. Supported values `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118`. Default `cu117`| |-ipex, --build-with-ipex| Specify to build with intel_extension_for_pytorch. If not specified, script builds without intel_extension_for_pytorch.| |--codebuild| Set if you need [AWS CodeBuild](https://aws.amazon.com/codebuild/)| +|-py, --pythonversion| Specify the python version to use. Supported values `3.8`, `3.9`, `3.10`. Default `3.9`| **PRODUCTION ENVIRONMENT IMAGES** @@ -303,3 +304,7 @@ docker run --rm --shm-size=1g \ -p7071:7071 \ --mount type=bind,source=/path/to/model/store,target=/tmp/models torchserve --model-store=/tmp/models ``` + +# Example showing how to serve a model using a Docker container + +[This](../examples/image_classifier/mnist/Docker.md) is an example showing how to serve the MNIST model using Docker. diff --git a/docker/build_custom_images.sh b/docker/build_custom_images.sh new file mode 100755 index 0000000000..dcebdd06c3 --- /dev/null +++ b/docker/build_custom_images.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +# build 23mt-cpu +./build_image.sh -bt dev -b torchserve-23mt -t textshuttle/pytorch-serve:23mt-cpu + +# build 23mt-gpu +./build_image.sh -bt dev -g -cv cu113 -b torchserve-23mt -t textshuttle/pytorch-serve:23mt-gpu diff --git a/docker/build_image.sh b/docker/build_image.sh index 0e8494e953..64f22252b0 100755 --- a/docker/build_image.sh +++ b/docker/build_image.sh @@ -1,16 +1,17 @@ #!/bin/bash +set -o errexit -o nounset -o pipefail + MACHINE=cpu BRANCH_NAME="master" DOCKER_TAG="pytorch/torchserve:latest-cpu" BUILD_TYPE="production" -DOCKER_FILE="Dockerfile" BASE_IMAGE="ubuntu:20.04" -CUSTOM_TAG=false +USE_CUSTOM_TAG=false CUDA_VERSION="" -UBUNTU_VERSION="ubuntu:20.04" USE_LOCAL_SERVE_FOLDER=false BUILD_WITH_IPEX=false +PYTHON_VERSION=3.9 for arg in "$@" do @@ -25,6 +26,7 @@ do echo "-t, --tag specify tag name for docker image" echo "-lf, --use-local-serve-folder specify this option for the benchmark image if the current 'serve' folder should be used during automated benchmarks" echo "-ipex, --build-with-ipex specify to build with intel_extension_for_pytorch" + echo "-py, --pythonversion specify the python version to use. Possible values: 3.8 3.9 3.10" exit 0 ;; -b|--branch_name) @@ -51,8 +53,8 @@ do shift ;; -t|--tag) - DOCKER_TAG="$2" - CUSTOM_TAG=true + CUSTOM_TAG="$2" + USE_CUSTOM_TAG=true shift shift ;; @@ -64,28 +66,42 @@ do BUILD_WITH_IPEX=true shift ;; + -py|--pythonversion) + PYTHON_VERSION="$2" + if [[ $PYTHON_VERSION = 3.8 || $PYTHON_VERSION = 3.9 || $PYTHON_VERSION = 3.10 ]]; then + echo "Valid python version" + else + echo "Valid python versions are 3.8, 3.9 and 3.10" + exit 1 + fi + shift + shift + ;; # With default ubuntu version 20.04 -cv|--cudaversion) CUDA_VERSION="$2" - if [ $CUDA_VERSION == "cu117" ]; + if [ "${CUDA_VERSION}" == "cu118" ]; + then + 
BASE_IMAGE="nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04" + elif [ "${CUDA_VERSION}" == "cu117" ]; then BASE_IMAGE="nvidia/cuda:11.7.0-cudnn8-runtime-ubuntu20.04" - elif [ $CUDA_VERSION == "cu116" ]; + elif [ "${CUDA_VERSION}" == "cu116" ]; then BASE_IMAGE="nvidia/cuda:11.6.0-cudnn8-runtime-ubuntu20.04" - elif [ $CUDA_VERSION == "cu113" ]; + elif [ "${CUDA_VERSION}" == "cu113" ]; then BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-runtime-ubuntu20.04" - elif [ $CUDA_VERSION == "cu111" ]; + elif [ "${CUDA_VERSION}" == "cu111" ]; then BASE_IMAGE="nvidia/cuda:11.1.1-cudnn8-runtime-ubuntu20.04" - elif [ $CUDA_VERSION == "cu102" ]; + elif [ "${CUDA_VERSION}" == "cu102" ]; then BASE_IMAGE="nvidia/cuda:10.2-cudnn8-runtime-ubuntu18.04" - elif [ $CUDA_VERSION == "cu101" ] + elif [ "${CUDA_VERSION}" == "cu101" ] then BASE_IMAGE="nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04" - elif [ $CUDA_VERSION == "cu92" ]; + elif [ "${CUDA_VERSION}" == "cu92" ]; then BASE_IMAGE="nvidia/cuda:9.2-cudnn7-runtime-ubuntu18.04" else @@ -104,22 +120,27 @@ then exit 1 fi -if [ "${BUILD_TYPE}" == "dev" ] && ! $CUSTOM_TAG ; +if [ "${BUILD_TYPE}" == "dev" ] && ! $USE_CUSTOM_TAG ; then DOCKER_TAG="pytorch/torchserve:dev-$MACHINE" fi -if [ "${BUILD_TYPE}" == "codebuild" ] && ! $CUSTOM_TAG ; +if [ "${BUILD_TYPE}" == "codebuild" ] && ! $USE_CUSTOM_TAG ; then DOCKER_TAG="pytorch/torchserve:codebuild-$MACHINE" fi -if [ $BUILD_TYPE == "production" ] +if [ "$USE_CUSTOM_TAG" = true ] +then + DOCKER_TAG=${CUSTOM_TAG} +fi + +if [ "${BUILD_TYPE}" == "production" ] then - DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg CUDA_VERSION=$CUDA_VERSION -t $DOCKER_TAG . -elif [ $BUILD_TYPE == "benchmark" ] + DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}" -t "${DOCKER_TAG}" . +elif [ "${BUILD_TYPE}" == "benchmark" ] then - DOCKER_BUILDKIT=1 docker build --pull --no-cache --file Dockerfile.benchmark --build-arg USE_LOCAL_SERVE_FOLDER=$USE_LOCAL_SERVE_FOLDER --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BRANCH_NAME=$BRANCH_NAME --build-arg CUDA_VERSION=$CUDA_VERSION --build-arg MACHINE_TYPE=$MACHINE -t $DOCKER_TAG . + DOCKER_BUILDKIT=1 docker build --pull --no-cache --file Dockerfile.benchmark --build-arg USE_LOCAL_SERVE_FOLDER=$USE_LOCAL_SERVE_FOLDER --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg CUDA_VERSION="${CUDA_VERSION}" --build-arg MACHINE_TYPE="${MACHINE}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}" -t "${DOCKER_TAG}" . else - DOCKER_BUILDKIT=1 docker build --pull --no-cache --file Dockerfile.dev -t $DOCKER_TAG --build-arg BUILD_TYPE=$BUILD_TYPE --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BRANCH_NAME=$BRANCH_NAME --build-arg CUDA_VERSION=$CUDA_VERSION --build-arg MACHINE_TYPE=$MACHINE --build-arg BUILD_WITH_IPEX=$BUILD_WITH_IPEX . + DOCKER_BUILDKIT=1 docker build --ssh default --progress plain --pull --no-cache --file Dockerfile.dev -t "${DOCKER_TAG}" --build-arg BUILD_TYPE="${BUILD_TYPE}" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg CUDA_VERSION="${CUDA_VERSION}" --build-arg MACHINE_TYPE="${MACHINE}" --build-arg BUILD_WITH_IPEX="${BUILD_WITH_IPEX}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}" . 
fi
diff --git a/docker/build_upload_release.py b/docker/build_upload_release.py
index 3bc866f653..fc3d462cb0 100644
--- a/docker/build_upload_release.py
+++ b/docker/build_upload_release.py
@@ -26,11 +26,9 @@ organization = args.organization

     # Upload pytorch/torchserve docker binaries
+    try_and_handle(f"./build_image.sh -t {organization}/torchserve:latest", dry_run)
     try_and_handle(
-        f"./build_image.sh -bt dev -t {organization}/torchserve:latest", dry_run
-    )
-    try_and_handle(
-        f"./build_image.sh -bt dev -g -cv cu117 -t {organization}/torchserve:latest-gpu",
+        f"./build_image.sh -g -cv cu117 -t {organization}/torchserve:latest-gpu",
         dry_run,
     )
     try_and_handle(
diff --git a/docker/docker_nightly.py b/docker/docker_nightly.py
index edc844f950..e35af9c0b3 100644
--- a/docker/docker_nightly.py
+++ b/docker/docker_nightly.py
@@ -32,9 +32,9 @@
     gpu_version = f"{project}:gpu-{get_nightly_version()}"

     # Build Nightly images and append the date in the name
-    try_and_handle(f"./build_image.sh -bt dev -t {organization}/{cpu_version}", dry_run)
+    try_and_handle(f"./build_image.sh -t {organization}/{cpu_version}", dry_run)
     try_and_handle(
-        f"./build_image.sh -bt dev -g -cv cu117 -t {organization}/{gpu_version}",
+        f"./build_image.sh -g -cv cu117 -t {organization}/{gpu_version}",
         dry_run,
     )
diff --git a/docker/test_build_image_tagging.sh b/docker/test_build_image_tagging.sh
new file mode 100755
index 0000000000..0bd9644d32
--- /dev/null
+++ b/docker/test_build_image_tagging.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+set -o errexit -o nounset -o pipefail
+
+# This test checks the parsing and handling of arguments in `build_image.sh`,
+# making sure that `build_image.sh` is invariant to the order of the passed
+# arguments `-py` (python version), `-t` (image tag) and `-g` (use gpu flag)
+# and that tagging works properly.
+# That means we have 3 args, so there are 6 possible orderings, and
+# we expect these script runs to produce the *very same output*:
+#
+# $ ./build_image.sh -py "${VERSION}" -t "${TAG}" -g
+# $ ./build_image.sh -py "${VERSION}" -g -t "${TAG}"
+# $ ./build_image.sh -t "${TAG}" -py "${VERSION}" -g
+# $ ./build_image.sh -t "${TAG}" -g -py "${VERSION}"
+# $ ./build_image.sh -g -py "${VERSION}" -t "${TAG}"
+# $ ./build_image.sh -g -t "${TAG}" -py "${VERSION}"
+#
+# In order to assert the equivalence of all these variations, we take advantage
+# of how docker builds images: if two images are exactly the same (i.e., they are
+# composed of the very same layers) they will have the same digest (i.e., a hash
+# value representing the content of the image), regardless of the tag assigned
+# to the image. So, for example, if we run (with the same Dockerfile):
+#
+# $ docker build -f Dockerfile -t Org/Repo:TagOne .
+# $ docker build -f Dockerfile -t Org/Repo:TagTwo .
+# $ docker images --no-trunc
+#
+# we will see something like this:
+#
+# REPOSITORY  TAG     IMAGE ID                                                                 CREATED         SIZE
+# Org/Repo    TagOne  sha256:e3824d794c0ccf10d2f61291f34e0d7e1e02e30b3d459465bc57d04dd3b65884  30 seconds ago  2.14GB
+# Org/Repo    TagTwo  sha256:e3824d794c0ccf10d2f61291f34e0d7e1e02e30b3d459465bc57d04dd3b65884  30 seconds ago  2.14GB
+#
+# Notice that IMAGE ID and CREATED are the same: the image is actually built
+# the first time, while the second build just reuses the cached layers.
+# So the tag is "just a label" attached to the underlying image.
+#
+# Putting it all together for our test:
+# We run `build_image.sh` (on the same machine, to allow the docker cache) with each
+# argument-order variation, tagging each variation with a different name (ensured
+# by the random part of the string).
+# We expect:
+# - All the tags to exist (tagging works): len(images_to_test) == len(tags_to_test)
+# - All tagged images to actually be one and the same under the hood: len(set(digests)) == 1
+
+
+PY_VERSION=$1
+TAG_1="org/repo:image-${PY_VERSION}-${RANDOM}-${RANDOM}-${RANDOM}-${RANDOM}"
+TAG_2="org/repo:image-${PY_VERSION}-${RANDOM}-${RANDOM}-${RANDOM}-${RANDOM}"
+TAG_3="org/repo:image-${PY_VERSION}-${RANDOM}-${RANDOM}-${RANDOM}-${RANDOM}"
+TAG_4="org/repo:image-${PY_VERSION}-${RANDOM}-${RANDOM}-${RANDOM}-${RANDOM}"
+TAG_5="org/repo:image-${PY_VERSION}-${RANDOM}-${RANDOM}-${RANDOM}-${RANDOM}"
+TAG_6="org/repo:image-${PY_VERSION}-${RANDOM}-${RANDOM}-${RANDOM}-${RANDOM}"
+
+# Do builds alternating the flags order (-g, -t, -py)
+# (which should build only one underlying image)
+./build_image.sh -py "${PY_VERSION}" -t "${TAG_1}" -g
+./build_image.sh -py "${PY_VERSION}" -g -t "${TAG_2}"
+
+./build_image.sh -g -py "${PY_VERSION}" -t "${TAG_3}"
+./build_image.sh -g -t "${TAG_4}" -py "${PY_VERSION}"
+
+./build_image.sh -t "${TAG_5}" -py "${PY_VERSION}" -g
+./build_image.sh -t "${TAG_6}" -g -py "${PY_VERSION}"
+
+# Collect all the images with their tags and ids
+IMGS_FILE="test_images.json"
+docker images --no-trunc --format "{{json .}}" | jq '{"repo": .Repository, "tag": .Tag, "digest": .ID}' | jq -s > "${IMGS_FILE}"
+
+python <<EOF
+import json
+
+# Keep only the images tagged by this test run
+tags_to_test = [t.split(":", 1)[1] for t in [
+    "${TAG_1}", "${TAG_2}", "${TAG_3}", "${TAG_4}", "${TAG_5}", "${TAG_6}"]]
+images = json.load(open("${IMGS_FILE}"))
+images_to_test = [i for i in images if i["tag"] in tags_to_test]
+digests = set(i["digest"] for i in images_to_test)
+assert len(images_to_test) == len(tags_to_test), "some tags were not created"
+assert len(digests) == 1, "expected one underlying image, got: %s" % digests
+print("Tagging test passed")
+EOF
diff --git a/docker/test_mnist_serving.sh b/docker/test_mnist_serving.sh
new file mode 100755
--- /dev/null
+++ b/docker/test_mnist_serving.sh
+#!/usr/bin/env bash
+
+set -o errexit -o nounset -o pipefail
+
+IMAGE_TAG=$1
+CONTAINER="ts-mnist-test"
+# Model files shipped with the repo's MNIST example
+FILES_PATH="../examples/image_classifier/mnist"
+SERVER_PATH="/home/model-server"
+TEST_ENTRYPOINT=$(mktemp ./test-entrypointXXX.sh)
+
+cat <<'EOF' > "${TEST_ENTRYPOINT}"
+#!/usr/bin/env bash
+
+torch-model-archiver \
+    --model-name=mnist \
+    --version=1.0 \
+    --model-file=/home/model-server/mnist.py \
+    --serialized-file=/home/model-server/mnist_cnn.pt \
+    --handler=/home/model-server/mnist_handler.py \
+    --export-path=/home/model-server/model-store
+
+torchserve --start --ts-config=/home/model-server/config.properties --models mnist=mnist.mar
+EOF
+
+echo "Starting container ${CONTAINER}"
+docker run --rm -d -it --name "${CONTAINER}" -p 8080:8080 -p 8081:8081 -p 8082:8082 \
+    -v "${FILES_PATH}/mnist.py":"${SERVER_PATH}/mnist.py" \
+    -v "${FILES_PATH}/mnist_cnn.pt":"${SERVER_PATH}/mnist_cnn.pt" \
+    -v "${FILES_PATH}/mnist_handler.py":"${SERVER_PATH}/mnist_handler.py" \
+    -v "${TEST_ENTRYPOINT}":"${SERVER_PATH}/test-entrypoint.sh" \
+    "${IMAGE_TAG}" \
+    /bin/bash test-entrypoint.sh
+
+echo "Waiting 10s for container to come up..."
+sleep 10
+
+assert_expected() {
+    PREDICTION=$1
+    EXPECTED=$2
+    if [ "${PREDICTION}" = "${EXPECTED}" ]; then
+        echo "✓ Prediction: ${PREDICTION} (Expected ${EXPECTED})"
+    else
+        echo "✘ Test failed: Prediction: ${PREDICTION}, expected ${EXPECTED}."
+        exit 1
+    fi
+}
+
+echo "Testing classifier with test images in container ${CONTAINER}..."
+for EXPECTED in {0..9}
+do
+    PREDICTION=$(curl -s localhost:8080/predictions/mnist -T "${FILES_PATH}/test_data/${EXPECTED}.png")
+    assert_expected "${PREDICTION}" "${EXPECTED}"
+done
+
+echo "Test successful!"
+
+docker stop "${CONTAINER}"
+rm -f "${TEST_ENTRYPOINT}"
diff --git a/docker/test_container_python_version.sh b/docker/test_container_python_version.sh
new file mode 100755
index 0000000000..5c2fe2a23c
--- /dev/null
+++ b/docker/test_container_python_version.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+set -o errexit -o nounset -o pipefail
+
+IMAGE_TAG=$1
+EXPECTED_VERSION=$2
+
+tmpfile=$(mktemp ./pyversionXXX.txt)
+assert_py_version() {
+    echo "Checking Python version..."
+    docker run --rm -t "${IMAGE_TAG}" exec python --version > "${tmpfile}"
+    if ! grep -q "Python ${EXPECTED_VERSION}" "${tmpfile}"
+    then
+        echo "Test failed: Wrong Python version. Expected ${EXPECTED_VERSION}, got $(cat "${tmpfile}")"
+        exit 1
+    else
+        echo "Test successful! Found version: $(cat "${tmpfile}")"
+    fi
+}
+
+assert_py_version
+
+rm -f "${tmpfile}"
diff --git a/docs/FAQs.md b/docs/FAQs.md
index e1abc36763..4c9be8a06d 100644
--- a/docs/FAQs.md
+++ b/docs/FAQs.md
@@ -15,8 +15,8 @@ Torchserve API's are compliant with the [OpenAPI specification 3.0](https://swag
### How to use Torchserve in production?
Depending on your use case, you will be able to deploy torchserve in production using following mechanisms.
-> Standalone deployment. Refer [TorchServe docker documentation](https://github.com/pytorch/serve/tree/master/docker#readme) or [TorchServe documentation](https://github.com/pytorch/serve/tree/master/docs#readme)
-> Cloud based deployment. Refer [TorchServe kubernetes documentation](https://github.com/pytorch/serve/tree/master/kubernetes#readme) or [TorchServe cloudformation documentation](https://github.com/pytorch/serve/tree/master/examples/cloudformation/README.md)
+> Standalone deployment. Refer [TorchServe docker documentation](https://github.com/pytorch/serve/tree/master/docker#readme) or [TorchServe documentation](README.md)
+> Cloud based deployment. Refer [TorchServe kubernetes documentation](https://github.com/pytorch/serve/tree/master/kubernetes#readme) or [TorchServe cloudformation documentation](https://github.com/pytorch/serve/tree/master/examples/cloudformation/README.md#cloudformation)

### What's difference between Torchserve and a python web app using web frameworks like Flask, Django?
@@ -25,7 +25,7 @@ Torchserve's main purpose is to serve models via http REST APIs, Torchserve is n
Relevant issues: [[581](https://github.com/pytorch/serve/issues/581),[569](https://github.com/pytorch/serve/issues/569)]

### Are there any sample Models available?
-Various models are provided in Torchserve out of the box. Checkout out Torchserve [Model Zoo](https://github.com/pytorch/serve/blob/master/docs/model_zoo.md) for list of all available models. You can also check out the [examples](https://github.com/pytorch/serve/tree/master/examples) folder.
+Various models are provided in Torchserve out of the box. Check out the Torchserve [Model Zoo](model_zoo.md) for a list of all available models. You can also check out the [examples](https://github.com/pytorch/serve/tree/master/examples) folder.

### Does Torchserve support other models based on programming languages other than python?
No. As of now, only python based models are supported.

@@ -40,39 +40,39 @@ If a model converts international language string to bytes, client needs to use

## Deployment and config
Relevant documents.
-- [Torchserve configuration](https://github.com/pytorch/serve/blob/master/docs/configuration.md)
-- [Model zoo](https://github.com/pytorch/serve/blob/master/docs/model_zoo.md#model-zoo)
-- [Snapshot](https://github.com/pytorch/serve/blob/master/docs/snapshot.md)
-- [Docker](../docker/README.md)
+- [Torchserve configuration](configuration.md)
+- [Model zoo](model_zoo.md)
+- [Snapshot](snapshot.md)
+- [Docker](https://github.com/pytorch/serve/blob/master/docker/README.md#docker-readme)

### Can I run Torchserve APIs on ports other than the default 8080 & 8081?
Yes, Torchserve API ports are configurable using a properties file or environment variable.
-Refer [configuration.md](configuration.md) for more details.
+Refer to [configuration](configuration.md) for more details.
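+For example, a minimal config.properties that moves both APIs off the default ports (the port values here are illustrative) could look like:
+```properties
+inference_address=http://127.0.0.1:8443
+management_address=http://127.0.0.1:8444
+```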
### How can I resolve model specific python dependency?
You can provide a `requirements.txt` while creating a mar file using "--requirements-file/ -r" flag. Also, you can add dependency files using "--extra-files" flag.
-Refer [configuration.md](configuration.md) for more details.
+Refer to [configuration](configuration.md) for more details.

### Can I deploy Torchserve in Kubernetes?
Yes, you can deploy Torchserve in Kubernetes using Helm charts.
-Refer [Kubernetes deployment ](../kubernetes/README.md) for more details.
+Refer to [Kubernetes deployment](https://github.com/pytorch/serve/blob/master/kubernetes/README.md#torchserve-kubernetes) for more details.

### Can I deploy Torchserve with AWS ELB and AWS ASG?
Yes, you can deploy Torchserve on a multi-node ASG AWS EC2 cluster. There is a cloud formation template available [here](https://github.com/pytorch/serve/blob/master/examples/cloudformation/ec2-asg.yaml) for this type of deployment. Refer to [Multi-node EC2 deployment behind Elastic LoadBalancer (ELB)](https://github.com/pytorch/serve/tree/master/examples/cloudformation/README.md#multi-node-ec2-deployment-behind-elastic-loadbalancer-elb) for more details.

### How can I backup and restore Torchserve state?
TorchServe preserves server runtime configuration across sessions such that a TorchServe instance experiencing either a planned or unplanned service stop can restore its state upon restart. These saved runtime configuration files can be used for backup and restore.
-Refer [TorchServe model snapshot](snapshot.md#torchserve-model-snapshot) for more details.
+Refer to [TorchServe model snapshot](snapshot.md) for more details.

### How can I build a Torchserve image from source?
-Torchserve has a utility [script](../docker/build_image.sh) for creating docker images, the docker image can be hardware-based CPU or GPU compatible. A Torchserve docker image could be CUDA version specific as well.
+Torchserve has a utility [script](https://github.com/pytorch/serve/blob/master/docker/build_image.sh) for creating docker images; the docker image can be CPU-based or GPU-compatible. A Torchserve docker image can also be made CUDA version specific.

All these docker images can be created using `build_image.sh` with appropriate options.

Run `./build_image.sh --help` for all available options.

-Refer [Create Torchserve docker image from source](../docker/README.md#create-torchserve-docker-image) for more details.
+Refer to [Create Torchserve docker image from source](https://github.com/pytorch/serve/blob/master/docker/README.md#create-torchserve-docker-image) for more details.

### How to build a Torchserve image for a specific branch or commit id?
To create a Docker image for a specific branch, use the following command:

@@ -91,11 +91,11 @@ The image created using Dockerfile.dev has Torchserve installed from source wher

TorchServe looks for the config.property file according to the order listed in the [doc](https://github.com/pytorch/serve/blob/master/docs/configuration.md#configproperties-file). There is no override mechanism.

### What are model_store, load_models, models?
-- model_store: A mandatory argument during TorchServe start. It can be either defined in config.property or overridden by TorchServe command line option "[--model-store](https://github.com/pytorch/serve/blob/master/docs/configuration.md#command-line-parameters)".
+- model_store: A mandatory argument during TorchServe start.
It can be either defined in config.property or overridden by TorchServe command line option "[--model-store](configuration.md)".

-- load_models: An optional argument during TorchServe start. It can be either defined in config.property or overridden by TorchServe command line option "[--models](https://github.com/pytorch/serve/blob/master/docs/configuration.md#command-line-parameters)".
+- load_models: An optional argument during TorchServe start. It can be either defined in config.property or overridden by TorchServe command line option "[--models](configuration.md)".

-- [models](https://github.com/pytorch/serve/blob/master/docs/configuration.md#command-line-parameters): Defines a list of models' configuration in config.property. A model's configuration can be overridden by [management API](https://github.com/pytorch/serve/blob/master/docs/management_api.md#register-a-model). It does not decide which models will be loaded during TorchServe start. There is no relationship b.w "models" and "load_models" (ie. TorchServe command line option [--models](https://github.com/pytorch/serve/blob/master/docs/configuration.md#command-line-parameters)).
+- [models](configuration.md): Defines a list of models' configuration in config.property. A model's configuration can be overridden by the [management API](management_api.md). It does not decide which models will be loaded during TorchServe start. There is no relationship between "models" and "load_models" (i.e., the TorchServe command line option [--models](configuration.md)).

###

@@ -108,43 +108,43 @@ You can use any tool like Postman, Insomnia or even use a python script to do so

### How can I add a custom API to an existing framework?
You can add a custom API using the **plugins SDK** available in Torchserve.
-Refer to [serving sdk](../serving-sdk) and [plugins](../plugins) for more details.
+Refer to [serving sdk](https://github.com/pytorch/serve/tree/master/serving-sdk) and [plugins](https://github.com/pytorch/serve/tree/master/plugins) for more details.

### How can I pass multiple images in an inference request call to my model?
You can provide multiple data items in a single inference request to your custom handler as key-value pairs in the `data` object.
-Refer [this](https://github.com/pytorch/serve/issues/529#issuecomment-658012913) for more details.
+Refer to [this issue](https://github.com/pytorch/serve/issues/529#issuecomment-658012913) for more details.

## Handler
Relevant documents
-- [Default handlers](default_handlers.md#torchserve-default-inference-handlers)
-- [Custom Handlers](custom_service.md#custom-handlers)
+- [Default handlers](default_handlers.md)
+- [Custom Handlers](custom_service.md)

### How do I return an image output for a model?
You would have to write a custom handler and modify the postprocessing to return the image.
-Refer [custom service documentation](custom_service.md#custom-handlers) for more details.
+Refer to [custom service documentation](custom_service.md) for more details.

### How to enhance the default handlers?
Write a custom handler that extends the default handler and just override the methods to be tuned.
-Refer [custom service documentation](custom_service.md#custom-handlers) for more details.
+Refer to [custom service documentation](custom_service.md) for more details.

### Do I always have to write a custom handler or are there default ones that I can use?
Yes, you can deploy your model with no code by using builtin default handlers.
-Refer [default handlers](default_handlers.md#torchserve-default-inference-handlers) for more details.
+Refer to [default handlers](default_handlers.md) for more details.
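+For example, an image classification model can be packaged with the builtin `image_classifier` handler and served without writing any handler code (assuming resnet18.pt is a TorchScript-serialized model; the names here are illustrative):
+```bash
+torch-model-archiver --model-name resnet18 --version 1.0 \
+    --serialized-file resnet18.pt --handler image_classifier \
+    --export-path model_store
+torchserve --start --model-store model_store --models resnet18=resnet18.mar
+```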
### Is it possible to deploy Hugging Face models?
Yes, you can deploy Hugging Face models using a custom handler.
-Refer [HuggingFace_Transformers](https://github.com/pytorch/serve/blob/master/examples/Huggingface_Transformers/README.md) for example.
+Refer to [HuggingFace_Transformers](https://github.com/pytorch/serve/blob/master/examples/Huggingface_Transformers/README.md#huggingface-transformers) for an example.

## Model-archiver
Relevant documents
- - [Model-archiver ](../model-archiver/README.md#torch-model-archiver-for-torchserve)
- - [Docker Readme](../docker/README.md)
+ - [Model-archiver](https://github.com/pytorch/serve/blob/master/model-archiver/README.md#torch-model-archiver-for-torchserve)
+ - [Docker Readme](https://github.com/pytorch/serve/blob/master/docker/README.md#docker-readme)

### What is a mar file?
A mar file is a zip file consisting of all model artifacts with the ".mar" extension. The cmd-line utility `torch-model-archiver` is used to create a mar file.

### How can I create a mar file using a Torchserve docker container?
-Yes, you create your mar file using a Torchserve container. Follow the steps given [here](../docker/README.md#create-torch-model-archiver-from-container).
+Yes, you can create your mar file using a Torchserve container. Follow the steps given [here](https://github.com/pytorch/serve/blob/master/docker/README.md#create-torch-model-archiver-from-container).

### Can I add multiple serialized files in a single mar file?
Currently `torch-model-archiver` allows supplying only one serialized file with `--serialized-file` parameter while creating the mar. However, you can supply any number and any type of file with `--extra-files` flag. All the files supplied in the mar file are available in `model_dir` location which can be accessed through the context object supplied to the handler's entry point.

@@ -155,7 +155,7 @@ Sample code snippet:
properties = context.system_properties
model_dir = properties.get("model_dir")
```
-Refer [Torch model archiver cli](../model-archiver/README.md#torch-model-archiver-command-line-interface) for more details.
+Refer to [Torch model archiver cli](https://github.com/pytorch/serve/blob/master/model-archiver/README.md#torch-model-archiver-command-line-interface) for more details.
Relevant issues: [[#633](https://github.com/pytorch/serve/issues/633)]

### Can I download and register model using s3 presigned v4 url?
@@ -169,3 +169,10 @@ A mar file can be used either locally or be publicly available via http. An S3 U

### How to set a model's batch size on SageMaker?
Key parameters for TorchServe performance tuning. [TorchServe performance tuning example](https://github.com/lxning/torchserve_perf/blob/master/torchserve_perf.ipynb)
+
+## Why is my model initialization so slow?
+There are a few reasons why model initialization can be slow:
+1. `torch.load()` overhead - not something we can improve; this will be more dramatic for larger models
+2. CUDA context launch overhead - not something we can control
+3. install_py_dep_per_model=true is intended for local development or SageMaker deployments; in other production environments you should pre-install your dependencies
+4. The model archiver has an overhead to compress and decompress models. Compression is on by default because TorchServe historically grew out of SageMaker needs, which involve loading and unloading large numbers of models stored in cloud buckets. For users with smaller deployments, choosing `torch-model-archiver --no-archive` is a good bet
diff --git a/docs/README.md b/docs/README.md
index 355a6e0268..8055e661ff 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -52,3 +52,4 @@ TorchServe is a performant, flexible and easy to use tool for serving PyTorch ea
* [TorchServe on Kubernetes](https://github.com/pytorch/serve/blob/master/kubernetes/README.md#torchserve-on-kubernetes) - Demonstrates a Torchserve deployment in Kubernetes using Helm Chart supported in both Azure Kubernetes Service and Google Kubernetes service
* [mlflow-torchserve](https://github.com/mlflow/mlflow-torchserve) - Deploy mlflow pipeline models into TorchServe
* [Kubeflow pipelines](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/pytorch-samples) - Kubeflow pipelines and Google Vertex AI Managed pipelines
+* [NVIDIA MPS](mps.md) - Use NVIDIA MPS to optimize multi-worker deployment on a single GPU
diff --git a/docs/code_coverage.md b/docs/code_coverage.md
index 6f8a746bc3..bda87f1d0f 100644
--- a/docs/code_coverage.md
+++ b/docs/code_coverage.md
@@ -12,7 +12,7 @@
  ```bash
  python ts_scripts/install_dependencies.py --environment=dev --cuda=cu102
  ```
-  > Supported cuda versions as cu117, cu116, cu113, cu111, cu102, cu101, cu92
+  > Supported cuda versions are cu118, cu117, cu116, cu113, cu111, cu102, cu101, cu92

- Execute sanity suite
  ```bash
diff --git a/docs/configuration.md b/docs/configuration.md
index 8177b88687..c86eabac16 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -188,7 +188,7 @@ Configuration parameter `install_py_dep_per_model` controls if the model server
install_py_dep_per_model=true
```

-User can also supply custom python packages in zip or tar.gz format using the `--extra-files` flag while creating the model-archive and make an entry of the file name in the `requirements` file.
+Users can also supply custom python packages in zip or tar.gz format using the `--extra-files` flag while creating the model-archive and make an entry of the file name in the `requirements` file.

### Restrict backend worker to access environment variables

@@ -212,8 +212,6 @@ Set nvidia environment variables. For example:

### Enable metrics api
* `enable_metrics_api` : Enable or disable metric apis i.e. it can be either `true` or `false`. Default: true (Enabled)
-* `metrics_format` : Use this to specify metric report format . At present, the only supported and default value for this is `prometheus`
- This is used in conjunction with `enable_metrics_api` option above.

### Config model
* `models`: Use this to set configurations specific to a model. The value is presented in json format.
@@ -228,7 +226,7 @@ Set nvidia environment variables. For example:
    }
  }
```
-A model's parameters are defined in [model source code](https://github.com/pytorch/serve/blob/master/frontend/server/src/main/java/org/pytorch/serve/wlm/Model.java#L24)
+A model's parameters are defined in [model source code](https://github.com/pytorch/serve/blob/a9e218ae95fe7690c84b555d0fb9021322c9b049/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java#L11)

* `minWorkers`: the minimum number of workers of a model
@@ -239,7 +237,7 @@ A model's parameters are defined in [model source code](https://github.com/pytor
* `defaultVersion`: the default version of a model
* `marName`: the mar file name of a model

-A model's configuration example
+A model's configuration example
```properties
models={\
    "noop": {\
@@ -266,6 +264,13 @@ models={\
    }\
}
```
+Starting from version 0.8.0, TorchServe allows for model configuration using a YAML file embedded in the MAR file. This YAML file contains two distinct parts that determine how a model is configured: frontend parameters and backend parameters (see [details](https://github.com/pytorch/serve/tree/master/model-archiver#config-file)).
+
+* The frontend parameters are controlled by TorchServe's frontend and specify the parameter names and default values. TorchServe now uses a priority order to determine the final value of a model's frontend parameters. Specifically, the config.property file has the lowest priority, followed by the model configuration YAML file, and finally, the REST or gRPC model management API has the highest priority.
+
+* The backend parameters are fully controlled by the user. A user's customized handler can access the backend parameters via the `model_yaml_config` property of the [context object](https://github.com/pytorch/serve/blob/a9e218ae95fe7690c84b555d0fb9021322c9b049/ts/context.py#L24), for example context.model_yaml_config["pippy"]["rpc_timeout"].
+
+* Users can allocate specific GPU device IDs to a model by defining "deviceIds" in the frontend parameters of the YAML file. TorchServe uses a round-robin strategy to assign device IDs to a model's workers. If they are specified in the YAML file, it round-robins the device IDs listed; otherwise, it uses all visible device IDs on the host.
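+A minimal sketch of such a YAML file is shown below; the parameter values are illustrative, the first block holds frontend parameters interpreted by TorchServe, and the `handler` section is a user-defined backend block:
+```yaml
+# frontend parameters, interpreted by TorchServe
+minWorkers: 2
+maxWorkers: 2
+deviceIds: [2, 3]  # round-robined across this model's workers
+# backend parameters, passed through to the handler via context.model_yaml_config
+handler:
+    max_length: 80
+```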
### Other properties

@@ -276,27 +281,29 @@ Most of the following properties are designed for performance tuning. Adjusting
* `netty_client_threads`: Number of backend netty thread. This specifies the number of threads in the WorkerThread [EventLoopGroup](https://livebook.manning.com/book/netty-in-action/chapter-8) which writes inference responses to the frontend. Default: number of logical processors available to the JVM.
* `default_workers_per_model`: Number of workers to create for each model that loaded at startup time. Default: available GPUs in system or number of logical processors available to the JVM.
* `job_queue_size`: Number inference jobs that frontend will queue before backend can serve. Default: 100.
+* `n_priorities`: Number of priority levels that jobs can be assigned via the `X-TS-Priority` header. Default: 1.
* `async_logging`: Enable asynchronous logging for higher throughput, log output may be delayed if this is enabled. Default: false.
* `default_response_timeout`: Timeout, in seconds, used for all models backend workers before they are deemed unresponsive and rebooted. Default: 120 seconds.
* `unregister_model_timeout`: Timeout, in seconds, used when handling an unregister model request when cleaning a process before it is deemed unresponsive and an error response is sent. Default: 120 seconds.
* `decode_input_request`: Configuration to let backend workers to decode requests, when the content type is known.
If this is set to "true", backend workers do "Bytearray to JSON object" conversion when the content type is "application/json" and
-the backend workers convert "Bytearray to utf-8 string" when the Content-Type of the request is set to "text*". Default: true
+the backend workers convert "Bytearray to utf-8 string" when the Content-Type of the request is set to "text*". Default: true
* `initial_worker_port` : This is the initial port number for auto assigning port to worker process.
* `model_store` : Path of model store directory.
-* `model_server_home` : Torchserve home directory.
+* `model_server_home` : Torchserve home directory.
* `max_request_size` : The maximum allowable request size that the Torchserve accepts, in bytes. Default: 6553500
* `max_response_size` : The maximum allowable response size that the Torchserve sends, in bytes. Default: 6553500
-* `limit_max_image_pixels` : Default value is true (Use default [PIL.Image.MAX_IMAGE_PIXELS](https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS)). If this is set to "false", set PIL.Image.MAX_IMAGE_PIXELS = None in backend default vision handler for large image payload.
-* `allowed_urls` : Comma separated regex of allowed source URL(s) from where models can be registered. Default: "file://.*|http(s)?://.*" (all URLs and local file system)
+* `limit_max_image_pixels` : Default value is true (Use default [PIL.Image.MAX_IMAGE_PIXELS](https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS)). If this is set to "false", set PIL.Image.MAX_IMAGE_PIXELS = None in backend default vision handler for large image payload.
+* `allowed_urls` : Comma separated regex of allowed source URL(s) from where models can be registered. Default: `file://.*|http(s)?://.*` (all URLs and local file system)
e.g. : To allow base URLs `https://s3.amazonaws.com/` and `https://torchserve.pytorch.org/` use the following regex string `allowed_urls=https://s3.amazonaws.com/.*,https://torchserve.pytorch.org/.*`
* `workflow_store` : Path of workflow store directory. Defaults to model store directory.
+* `disable_system_metrics` : Disable collection of system metrics when set to "true". Default value is "false".

**NOTE**

All the above config properties can be set using environment variable as follows.
- set `enable_envvars_config` to true in config.properties
-- export environment variable for property as`TS_`.
+- export environment variable for property as `TS_<PROPERTY_NAME>`.
e.g.: to set inference_address property run cmd `export TS_INFERENCE_ADDRESS="http://127.0.0.1:8082"`.
diff --git a/docs/contents.rst b/docs/contents.rst
index a6f23d8d45..7d255be9f0 100644
--- a/docs/contents.rst
+++ b/docs/contents.rst
@@ -3,7 +3,7 @@
    :numbered:
    :caption: Contents:
    :titlesonly:
-
+
    index
    Troubleshooting
    batch_inference_with_ts
@@ -16,20 +16,22 @@
    model_zoo
    request_envelopes
    server
+   mps
    snapshot
    sphinx/requirements
    torchserve_on_win_native
    torchserve_on_wsl
    use_cases
    workflows
+   large_model_inference

.. toctree::
    :maxdepth: 0
    :caption: Service APIs:
-
+
    apis

.. toctree::
    :caption: Developer APIs:
-
-   api/dev_api
\ No newline at end of file
+
+   api/dev_api
diff --git a/docs/getting_started.md b/docs/getting_started.md
index 029b67471e..11daf6db7b 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -14,7 +14,7 @@
   python ./ts_scripts/install_dependencies.py
   ```

-  - For GPU with Cuda 10.2. Options are `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`
+  - For GPU with Cuda 10.2. Options are `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118`

  ```bash
  python ./ts_scripts/install_dependencies.py --cuda=cu102
diff --git a/docs/grpc_api.md b/docs/grpc_api.md
index 4583b23b72..d533044736 100644
--- a/docs/grpc_api.md
+++ b/docs/grpc_api.md
@@ -9,6 +9,7 @@ TorchServe provides following gRPCs apis
* [Inference API](https://github.com/pytorch/serve/blob/master/frontend/server/src/main/resources/proto/inference.proto)
  - **Ping** : Gets the health status of the running server
  - **Predictions** : Gets predictions from the served model
+ - **StreamPredictions** : Gets server-side streaming predictions from the served model

* [Management API](https://github.com/pytorch/serve/blob/master/frontend/server/src/main/resources/proto/management.proto)
  - **RegisterModel** : Serve a model/model-version on TorchServe
@@ -70,3 +71,28 @@ python ts_scripts/torchserve_grpc_client.py infer densenet161 examples/image_cla
```bash
python ts_scripts/torchserve_grpc_client.py unregister densenet161
```
+## GRPC Server Side Streaming
+TorchServe's gRPC API adds server-side streaming of the inference API via "StreamPredictions", which allows a sequence of inference responses to be sent over the same gRPC stream. This API is only recommended for use cases where the inference latency of the full response is high and intermediate results should be sent to the client as they become available. An example could be LLMs for generative applications, where generating "n" tokens can have high latency; in this case, the user can receive each generated token as soon as it is ready, until the full response completes. This API automatically forces the batchSize to be one.
+
+```
+service InferenceAPIsService {
+    // Check health status of the TorchServe server.
+    rpc Ping(google.protobuf.Empty) returns (TorchServeHealthResponse) {}
+
+    // Predictions entry point to get inference using default model version.
+    rpc Predictions(PredictionsRequest) returns (PredictionResponse) {}
+
+    // Streaming response for an inference request.
+    rpc StreamPredictions(PredictionsRequest) returns (stream PredictionResponse) {}
+}
+```
+The backend handler calls "send_intermediate_predict_response" to send an intermediate result to the frontend, and returns the last result in the existing style.
For example:
+```
+from ts.protocol.otf_message_handler import send_intermediate_predict_response
+
+def handle(data, context):
+    if type(data) is list:
+        for i in range(3):
+            send_intermediate_predict_response(["intermediate_response"], context.request_ids, "Intermediate Prediction success", 200, context)
+        return ["hello world "]
+```
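+On the client side, a minimal sketch (assuming Python stubs generated from inference.proto, and the default gRPC inference port 7070) iterates over the streamed responses:
+```python
+import grpc
+import inference_pb2
+import inference_pb2_grpc
+
+channel = grpc.insecure_channel("localhost:7070")
+stub = inference_pb2_grpc.InferenceAPIsServiceStub(channel)
+request = inference_pb2.PredictionsRequest(
+    model_name="echo_stream", input={"data": b"foo"}
+)
+# StreamPredictions yields one PredictionResponse per intermediate result
+for response in stub.StreamPredictions(request):
+    print(response.prediction.decode("utf-8"))
+```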
diff --git a/docs/images/mps_g4_single.png b/docs/images/mps_g4_single.png
new file mode 100644
index 0000000000..a35b79abd2
Binary files /dev/null and b/docs/images/mps_g4_single.png differ
diff --git a/docs/images/mps_g4_two_worker.png b/docs/images/mps_g4_two_worker.png
new file mode 100644
index 0000000000..7dcbf53bec
Binary files /dev/null and b/docs/images/mps_g4_two_worker.png differ
diff --git a/docs/images/mps_p3_single.png b/docs/images/mps_p3_single.png
new file mode 100644
index 0000000000..be1b88f9d2
Binary files /dev/null and b/docs/images/mps_p3_single.png differ
diff --git a/docs/images/mps_p3_two_worker.png b/docs/images/mps_p3_two_worker.png
new file mode 100644
index 0000000000..06339999d8
Binary files /dev/null and b/docs/images/mps_p3_two_worker.png differ
diff --git a/docs/images/ts-lmi-internal.png b/docs/images/ts-lmi-internal.png
new file mode 100644
index 0000000000..838eb40915
Binary files /dev/null and b/docs/images/ts-lmi-internal.png differ
diff --git a/docs/index.md b/docs/index.md
index 824d7ab259..a01b1ce5d7 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,8 +1,23 @@
# TorchServe

-TorchServe is a performant, flexible and easy to use tool for serving PyTorch eager mode and torschripted models.
+TorchServe is a performant, flexible and easy to use tool for serving PyTorch models in production.

-## Basic Features
+
+## ⚡ Why TorchServe
+* [Model Management API](management_api.md): multi model management with optimized worker to model allocation
+* [Inference API](inference_api.md): REST and gRPC support for batched inference
+* [TorchServe Workflows](https://github.com/pytorch/serve/blob/master/examples/Workflows/README.md#workflow-examples): deploy complex DAGs with multiple interdependent models
+* Default way to serve PyTorch models in
+  * [Kubeflow](https://v0-5.kubeflow.org/docs/components/pytorchserving/)
+  * [MLflow](https://github.com/mlflow/mlflow-torchserve)
+  * [Sagemaker](https://aws.amazon.com/blogs/machine-learning/serving-pytorch-models-in-production-with-the-amazon-sagemaker-native-torchserve-integration/)
+  * [Kserve](https://kserve.github.io/website/0.8/modelserving/v1beta1/torchserve/): Supports both v1 and v2 API
+  * [Vertex AI](https://cloud.google.com/blog/topics/developers-practitioners/pytorch-google-cloud-how-deploy-pytorch-models-vertex-ai)
+* Export your model for optimized inference. Torchscript out of the box, [ORT and ONNX](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md#performance-guide), [IPEX](https://github.com/pytorch/serve/tree/master/examples/intel_extension_for_pytorch), [TensorRT](performance_guide.md), [FasterTransformer](https://github.com/pytorch/serve/tree/master/examples/FasterTransformer_HuggingFace_Bert)
+* [Performance Guide](performance_guide.md): builtin support to optimize, benchmark and profile PyTorch and TorchServe performance
+* [Expressive handlers](https://github.com/pytorch/serve/blob/master/CONTRIBUTING.md#contributing-to-torchServe): An expressive handler architecture that makes it trivial to support inferencing for your use case with [many supported out of the box](https://github.com/pytorch/serve/tree/master/ts/torch_handler)
+* [Metrics API](metrics.md): out-of-the-box support for system level metrics with [Prometheus exports](https://github.com/pytorch/serve/tree/master/examples/custom_metrics), custom metrics and PyTorch profiler support

+## 🤔 How does TorchServe work
* [Serving Quick Start](https://github.com/pytorch/serve/blob/master/README.md#serve-a-model) - Basic server usage tutorial
* [Model Archive Quick Start](https://github.com/pytorch/serve/tree/master/model-archiver#creating-a-model-archive) - Tutorial that shows you how to package a model archive file.
@@ -29,14 +44,19 @@
* [Object Detector](https://github.com/pytorch/serve/blob/master/ts/torch_handler/object_detector.py) - This handler takes an image and returns list of detected classes and bounding boxes respectively
* [Image Segmenter](https://github.com/pytorch/serve/blob/master/ts/torch_handler/image_segmenter.py)- This handler takes an image and returns output shape as [CL H W], CL - number of classes, H - height and W - width

-## Examples
+## 🏆 Highlighted Examples
+
+* [🤗 HuggingFace Transformers](https://github.com/pytorch/serve/blob/master/examples/Huggingface_Transformers) with a [Better Transformer Integration](https://github.com/pytorch/serve/blob/master/examples/Huggingface_Transformers#Speed-up-inference-with-Better-Transformer)
+* [Model parallel inference](https://github.com/pytorch/serve/blob/master/examples/Huggingface_Transformers#model-parallelism)
+* [MultiModal models with MMF](https://github.com/pytorch/serve/tree/master/examples/MMF-activity-recognition) combining text, audio and video
+* [Dual Neural Machine Translation](https://github.com/pytorch/serve/blob/master/examples/Workflows/nmt_transformers_pipeline) for a complex workflow DAG
+* [TorchServe Integrations](https://github.com/pytorch/serve/blob/master/examples/README.md#torchserve-integrations)
+* [TorchServe Internals](https://github.com/pytorch/serve/blob/master/examples/README.md#torchserve-internals)
+* [TorchServe UseCases](https://github.com/pytorch/serve/blob/master/examples/README.md#usecases)
+* [Model Zoo](https://github.com/pytorch/serve/blob/master/docs/model_zoo.md) - List of pre-trained model archives ready to be served for inference with TorchServe.
+
+For more examples, see the [examples overview](https://github.com/pytorch/serve/blob/master/examples/README.md#torchserve-internals).

-* [HuggingFace Language Model](https://github.com/pytorch/serve/blob/master/examples/Huggingface_Transformers/Transformer_handler_generalized.py) - This handler takes an input sentence and can return sequence classifications, token classifications or Q&A answers
-* [Multi Modal Framework](https://github.com/pytorch/serve/blob/master/examples/MMF-activity-recognition/handler.py) - Build and deploy a classifier that combines text, audio and video input data
-* [Dual Translation Workflow](https://github.com/pytorch/serve/tree/master/examples/Workflows/nmt_transformers_pipeline) -
-* [Model Zoo](model_zoo.md) - List of pre-trained model archives ready to be served for inference with TorchServe.
-* [Examples](https://github.com/pytorch/serve/tree/master/examples) - Many examples of how to package and deploy models with TorchServe
-* [Workflow Examples](https://github.com/pytorch/serve/tree/master/examples/Workflows) - Examples of how to compose models in a workflow with TorchServe

## Advanced Features

@@ -49,3 +69,30 @@
* [TorchServe on Kubernetes](https://github.com/pytorch/serve/blob/master/kubernetes/README.md#torchserve-on-kubernetes) - Demonstrates a Torchserve deployment in Kubernetes using Helm Chart supported in both Azure Kubernetes Service and Google Kubernetes service
* [mlflow-torchserve](https://github.com/mlflow/mlflow-torchserve) - Deploy mlflow pipeline models into TorchServe
* [Kubeflow pipelines](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/pytorch-samples) - Kubeflow pipelines and Google Vertex AI Managed pipelines
+* [NVIDIA MPS](mps.md) - Use NVIDIA MPS to optimize multi-worker deployment on a single GPU
+
+## 📰 News
+* [Torchserve Performance Tuning, Animated Drawings Case-Study](https://pytorch.org/blog/torchserve-performance-tuning/)
+* [Walmart Search: Serving Models at a Scale on TorchServe](https://medium.com/walmartglobaltech/search-model-serving-using-pytorch-and-torchserve-6caf9d1c5f4d)
+* [🎥 Scaling inference on CPU with TorchServe](https://www.youtube.com/watch?v=066_Jd6cwZg)
+* [🎥 TorchServe C++ backend](https://www.youtube.com/watch?v=OSmGGDpaesc)
+* [Grokking Intel CPU PyTorch performance from first principles: a TorchServe case study](https://pytorch.org/tutorials/intermediate/torchserve_with_ipex.html)
+* [Grokking Intel CPU PyTorch performance from first principles (Part 2): a TorchServe case study](https://pytorch.org/tutorials/intermediate/torchserve_with_ipex_2.html)
+* [Case Study: Amazon Ads Uses PyTorch and AWS Inferentia to Scale Models for Ads Processing](https://pytorch.org/blog/amazon-ads-case-study/)
+* [Optimize your inference jobs using dynamic batch inference with TorchServe on Amazon SageMaker](https://aws.amazon.com/blogs/machine-learning/optimize-your-inference-jobs-using-dynamic-batch-inference-with-torchserve-on-amazon-sagemaker/)
+* [Using AI to bring children's drawings to life](https://ai.facebook.com/blog/using-ai-to-bring-childrens-drawings-to-life/)
+* [🎥 Model Serving in PyTorch](https://www.youtube.com/watch?v=2A17ZtycsPw)
+* [Evolution of Cresta's machine learning architecture: Migration to AWS and PyTorch](https://aws.amazon.com/blogs/machine-learning/evolution-of-crestas-machine-learning-architecture-migration-to-aws-and-pytorch/)
+* [🎥 Explain Like I’m 5: TorchServe](https://www.youtube.com/watch?v=NEdZbkfHQCk)
+* [🎥 How to Serve PyTorch Models with TorchServe](https://www.youtube.com/watch?v=XlO7iQMV3Ik)
+* [How to deploy PyTorch models on Vertex AI](https://cloud.google.com/blog/topics/developers-practitioners/pytorch-google-cloud-how-deploy-pytorch-models-vertex-ai)
+* [Quantitative Comparison of Serving Platforms](https://biano-ai.github.io/research/2021/08/16/quantitative-comparison-of-serving-platforms-for-neural-networks.html)
+* [Efficient Serverless deployment of PyTorch models on Azure](https://medium.com/pytorch/efficient-serverless-deployment-of-pytorch-models-on-azure-dc9c2b6bfee7)
+* [Deploy PyTorch models with TorchServe in Azure Machine Learning online endpoints](https://techcommunity.microsoft.com/t5/ai-machine-learning-blog/deploy-pytorch-models-with-torchserve-in-azure-machine-learning/ba-p/2466459)
+* [Dynaboard moving beyond accuracy to holistic model evaluation in NLP](https://ai.facebook.com/blog/dynaboard-moving-beyond-accuracy-to-holistic-model-evaluation-in-nlp/)
+* [A MLOps Tale about operationalising MLFlow and PyTorch](https://medium.com/mlops-community/engineering-lab-1-team-1-a-mlops-tale-about-operationalising-mlflow-and-pytorch-62193b55dc19)
+* [Operationalize, Scale and Infuse Trust in AI Models using KFServing](https://blog.kubeflow.org/release/official/2021/03/08/kfserving-0.5.html)
+* [How Wadhwani AI Uses PyTorch To Empower Cotton Farmers](https://medium.com/pytorch/how-wadhwani-ai-uses-pytorch-to-empower-cotton-farmers-14397f4c9f2b)
+* [TorchServe Streamlit Integration](https://cceyda.github.io/blog/huggingface/torchserve/streamlit/ner/2020/10/09/huggingface_streamlit_serve.html)
+* [Dynabench aims to make AI models more robust through distributed human workers](https://venturebeat.com/2020/09/24/facebooks-dynabench-aims-to-make-ai-models-more-robust-through-distributed-human-workers/)
+* [Announcing TorchServe](https://aws.amazon.com/blogs/aws/announcing-torchserve-an-open-source-model-server-for-pytorch/)
diff --git a/docs/inference_api.md b/docs/inference_api.md
index ef85e4ffc5..988aabd7a4 100644
--- a/docs/inference_api.md
+++ b/docs/inference_api.md
@@ -1,4 +1,4 @@
-# Inference API
+# [Inference API](#inference-api)

Inference API is listening on port 8080 and only accessible from localhost by default. To change the default setting, see [TorchServe Configuration](configuration.md).

@@ -41,6 +41,11 @@ If the server is running, the response is:
}
```

+"maxRetryTimeoutInSec" (default: 5 minutes) can be defined in a model's config YAML file (e.g., model-config.yaml). It is the maximum time window for recovering a dead backend worker. A healthy worker can be in the state WORKER_STARTED, WORKER_MODEL_LOADED, or WORKER_STOPPED within the maxRetryTimeoutInSec window. The "Ping" endpoint:
+* returns 200 + JSON message "healthy" if, for every model, the number of active workers is equal to or larger than the configured minWorkers.
+* returns 500 + JSON message "unhealthy" if, for any model, the number of active workers is less than the configured minWorkers.
+
+
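+For example, a model-config.yaml that shortens this recovery window could look like this (the values are illustrative):
+```yaml
+maxRetryTimeoutInSec: 60
+minWorkers: 1
+```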
## Predictions API

This API follows the [InferenceAPIsService.Predictions](https://github.com/pytorch/serve/blob/master/frontend/server/src/main/resources/proto/inference.proto) gRPC API. It returns predictions from the served model.

@@ -75,7 +80,7 @@ To get predictions from a specific version of each loaded model, make a REST cal

* POST /predictions/{model_name}/{version}

-## curl Example
+### curl Example

```bash
curl -O https://raw.githubusercontent.com/pytorch/serve/master/docs/images/kitten_small.jpg
@@ -95,6 +100,34 @@ The result is JSON that tells you that the image is most likely a tabby cat. The
    "probability": 0.42514491081237793
  }
```
+* Streaming response via HTTP 1.1 chunked encoding
+The TorchServe inference API supports streaming responses, allowing a sequence of inference responses to be sent over HTTP 1.1 chunked encoding. This feature is only recommended for use cases where the inference latency of the full response is high and intermediate results should be sent to the client as they become available. An example could be LLMs for generative applications, where generating "n" tokens can have high latency; in this case, the user can receive each generated token as soon as it is ready, until the full response completes. To achieve a streaming response, the backend handler calls "send_intermediate_predict_response" to send an intermediate result to the frontend, and returns the last result in the existing style. For example:
+```
+from ts.protocol.otf_message_handler import send_intermediate_predict_response
+def handle(data, context):
+    if type(data) is list:
+        for i in range(3):
+            send_intermediate_predict_response(["intermediate_response"], context.request_ids, "Intermediate Prediction success", 200, context)
+        return ["hello world "]
+```
+The client side receives the chunked data:
+```
+def test_echo_stream_inference():
+    test_utils.start_torchserve(no_config_snapshots=True, gen_mar=False)
+    test_utils.register_model('echo_stream',
+                              'https://torchserve.pytorch.org/mar_files/echo_stream.mar')
+
+    response = requests.post(TF_INFERENCE_API + '/predictions/echo_stream', data="foo", stream=True)
+    assert response.headers['Transfer-Encoding'] == 'chunked'
+
+    prediction = []
+    for chunk in (response.iter_content(chunk_size=None)):
+        if chunk:
+            prediction.append(chunk.decode("utf-8"))
+
+    assert str(" ".join(prediction)) == "hello hello hello hello world "
+    test_utils.unregister_model('echo_stream')
+```

## Explanations API

Torchserve makes use of Captum's functionality to return the explanations of the models that is served.

@@ -181,10 +214,9 @@ The result is a json that gives you the explanations for the input json
        0.007599905146155397,
        ,
        ,
-        ,
+        ,
      ]
    ]
  ]
]
}
-
diff --git a/docs/large_model_inference.md b/docs/large_model_inference.md
new file mode 100644
index 0000000000..dbeaf47248
--- /dev/null
+++ b/docs/large_model_inference.md
@@ -0,0 +1,207 @@
+# Serving large models with Torchserve
+
+This document explains how Torchserve supports large model serving; here, a large model refers to a model that is not able to fit into one GPU and therefore needs to be split into multiple partitions over multiple GPUs.
+
+## How it works
+
+When deploying a worker of a large model, TorchServe utilizes [torchrun](https://pytorch.org/docs/stable/elastic/run.html) to set up the distributed environment for model parallel processing. TorchServe has the capability to support multiple workers for a large model. By default, TorchServe uses a round-robin algorithm to assign GPUs to a worker on a host. In the case of large model inference, the GPUs assigned to each worker are automatically calculated based on the number of GPUs specified in the model_config.yaml, and CUDA_VISIBLE_DEVICES is set based on this number.
+
+For instance, suppose there are eight GPUs on a node and one worker needs 4 GPUs (i.e., nproc-per-node=4) on a node. In this case, TorchServe would assign CUDA_VISIBLE_DEVICES="0,1,2,3" to worker1 and CUDA_VISIBLE_DEVICES="4,5,6,7" to worker2.
+
+In addition to this default behavior, TorchServe provides the flexibility for users to specify GPUs for a worker. For instance, if the user sets "deviceIds: [2,3,4,5]" in the [model config YAML file](https://github.com/pytorch/serve/blob/5ee02e4f050c9b349025d87405b246e970ee710b/model-archiver/README.md?plain=1#L164), and nproc-per-node is set to 2, then TorchServe would assign CUDA_VISIBLE_DEVICES="2,3" to worker1 and CUDA_VISIBLE_DEVICES="4,5" to worker2.
+
+Using the PiPPy integration as an example, the image below illustrates the internals of TorchServe large model inference.
+
+![ts-lmi-internal](images/ts-lmi-internal.png)
+
+## PiPPy (PyTorch native solution for large model inference)
+
+PiPPy provides pipeline parallelism for serving large models that would not fit into one GPU. It takes your model and splits it into equal-sized parts (stages) partitioned over the number of devices you specify. It then uses microbatching to run your batched input for inference (this is more beneficial for batch sizes >1).
+
+### How to use PiPPy in Torchserve
+
+To use PiPPy in Torchserve, we need to use a custom handler which inherits from base_pippy_handler and put our settings in model-config.yaml.
+
+A custom handler in Torchserve is simply a python script that defines model loading, preprocess, inference and postprocess logic specific to your workflow.
+
+It would look like below:
+
+Create `custom_handler.py` or any other descriptive name.
+
+```python
+# Do import the necessary packages along with the following
+from ts.torch_handler.distributed.base_pippy_handler import BasePippyHandler
+from ts.handler_utils.distributed.pt_pippy import initialize_rpc_workers, get_pipline_driver
+class ModelHandler(BasePippyHandler, ABC):
+    def __init__(self):
+        super(ModelHandler, self).__init__()
+        self.initialized = False
+
+    def initialize(self, ctx):
+        model = ...  # load your model from model_dir
+        self.device = self.local_rank % torch.cuda.device_count()  # used to move model inputs to (self.device)
+        self.model = get_pipline_driver(model, self.world_size, ctx)
+
+```
+
+Here is what your `model-config.yaml` needs. This config file is very flexible; you can add settings related to the frontend, backend and handler.
+
+```yaml
+#frontend settings
+minWorkers: 1
+maxWorkers: 1
+maxBatchDelay: 100
+responseTimeout: 120
+deviceType: "gpu"
+parallelType: "pp" # options depending on the solution: pp (pipeline parallelism), tp (tensor parallelism), pptp (pipeline and tensor parallelism)
+                   # This will be used to route input to either rank0 or all ranks from the frontend based on the solution (e.g. DeepSpeed supports tp, PiPPy supports pp)
+torchrun:
+    nproc-per-node: 4 # specifies the number of processes torchrun starts to serve your model, set to world_size or the number of
+                      # gpus you wish to split your model across
+#backend settings
+pippy:
+    chunks: 1 # This sets the microbatch sizes, microbatch = batch size / chunks
+    input_names: ['input_ids'] # input arg names to the model, this is required for FX tracing
+    model_type: "HF" # set the model type to HF if you are using a Huggingface model, otherwise leave it blank or any other model you use
+    rpc_timeout: 1800
+    num_worker_threads: 512 # set number of threads for rpc worker init
+
+handler:
+    max_length: 80 # max length of tokens for tokenizer in the handler
+```
+
+**How to access it in the handler?** Here is an example:
+
+```python
+def initialize(self, ctx):
+    model_type = ctx.model_yaml_config["pippy"]["model_type"]
+
+```
+
+The rest is as usual in Torchserve, basically packaging your model and starting the server.
+
+Here is an example of the command for packaging your model; make sure you pass model-config.yaml:
+
+```bash
+torch-model-archiver --model-name bloom --version 1.0 --handler pippy_handler.py --extra-files $MODEL_CHECKPOINTS_PATH -r requirements.txt --config-file model-config.yaml --archive-format tgz
+
+```
+
+Tensor parallel support is in progress and will be added as soon as it is ready.
+
+## DeepSpeed
+
+[DeepSpeed-Inference](https://www.deepspeed.ai/inference/) is an open source project of Microsoft. It provides model parallelism for serving large transformer-based PyTorch models that would not fit into one GPU's memory.
+
+
+### How to use DeepSpeed in TorchServe
+
+To use DeepSpeed in TorchServe, we need to use a custom handler which inherits from base_deepspeed_handler and put our settings in model-config.yaml.
+
+It would look like below:
+
+Create `custom_handler.py` or any other descriptive name.
+
+```python
+# Do import the necessary packages along with the following
+from ts.handler_utils.distributed.deepspeed import get_ds_engine
+from ts.torch_handler.distributed.base_deepspeed_handler import BaseDeepSpeedHandler
+class ModelHandler(BaseDeepSpeedHandler, ABC):
+    def __init__(self):
+        super(ModelHandler, self).__init__()
+        self.initialized = False
+
+    def initialize(self, ctx):
+        model = ...  # load your model from model_dir
+        ds_engine = get_ds_engine(self.model, ctx)
+        self.model = ds_engine.module
+        self.initialized = True
+```
+
+Here is what your `model-config.yaml` needs. This config file is very flexible; you can add settings related to the frontend, backend and handler.
+
+```yaml
+#frontend settings
+minWorkers: 1
+maxWorkers: 1
+maxBatchDelay: 100
+responseTimeout: 120
+deviceType: "gpu"
+parallelType: "tp" # options depending on the solution: pp (pipeline parallelism), tp (tensor parallelism), pptp (pipeline and tensor parallelism)
+                   # This will be used to route input to either rank0 or all ranks from the frontend based on the solution (e.g. DeepSpeed supports tp, PiPPy supports pp)
+torchrun:
+    nproc-per-node: 4 # specifies the number of processes torchrun starts to serve your model, set to world_size or the number of
+                      # gpus you wish to split your model across
+#backend settings
+deepspeed:
+    config: ds-config.json # DeepSpeed config json filename.
+                           # Details: https://www.deepspeed.ai/docs/config-json/
+handler:
+    max_length: 80 # max length of tokens for tokenizer in the handler
+```
+
+Here is an example of `ds-config.json`:
+
+```json
+{
+  "dtype": "torch.float16",
+  "replace_with_kernel_inject": true,
+  "tensor_parallel": {
+    "tp_size": 2
+  }
+}
+```
+
+**Install DeepSpeed**
+
+*Method 1*: requirements.txt
+
+*Method 2*: pre-install via command (recommended to speed up model loading)
+
+```bash
+# See https://www.deepspeed.ai/tutorials/advanced-install/
+DS_BUILD_OPS=1 pip install deepspeed
+```
+
+The rest is as usual in Torchserve, basically packaging your model and starting the server.
+
+Here is an example of the command for packaging your model; make sure you pass model-config.yaml:
+
+```bash
+# option 1: Using model_dir
+torch-model-archiver --model-name bloom --version 1.0 --handler deepspeed_handler.py --extra-files $MODEL_CHECKPOINTS_PATH,ds-config.json -r requirements.txt --config-file model-config.yaml --archive-format tgz
+
+# option 2: Using HF model_name
+torch-model-archiver --model-name bloom --version 1.0 --handler deepspeed_handler.py --extra-files ds-config.json -r requirements.txt --config-file model-config.yaml --archive-format tgz
+```
+
+## Best Practice
+
+#### To reduce model loading latency, we recommend
+* Pre-install the model parallel library such as Deepspeed on the container or host.
+* Pre-download the model checkpoints. For example, a HuggingFace pretrained model can be pre-downloaded via [Download_model.py](https://github.com/pytorch/serve/blob/75f66dc557b3b67a3ab56536a37d7aa21582cc04/examples/large_models/deepspeed/opt/Readme.md?plain=1#L7)
+  * Set the environment variables [HUGGINGFACE_HUB_CACHE](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#understand-caching) and [TRANSFORMERS_CACHE](https://huggingface.co/transformers/v4.0.1/installation.html#caching-models)
+  * Download the model to the HuggingFace cache dir via the tool [Download_model.py](https://github.com/pytorch/serve/blob/4fe5273cd6f98fb5abc570f802b402ac32ecd105/examples/large_models/Huggingface_pippy/Readme.md?plain=1#L20)
+
+#### Tune "[responseTimeout](https://github.com/pytorch/serve/blob/5ee02e4f050c9b349025d87405b246e970ee710b/docs/configuration.md?plain=1#L216)" (see [model config YAML file](https://github.com/pytorch/serve/blob/5ee02e4f050c9b349025d87405b246e970ee710b/model-archiver/README.md?plain=1#L164)) if high model loading or inference latency causes response timeouts.
+
+#### Tune torchrun parameters
+Users can tune torchrun parameters in the [model config YAML file](https://github.com/pytorch/serve/blob/2f1f52f553e83703b5c380c2570a36708ee5cafa/model-archiver/README.md?plain=1#L179). The supported parameters are defined [here](https://github.com/pytorch/serve/blob/2f1f52f553e83703b5c380c2570a36708ee5cafa/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java#L329). For example, by default, `OMP_NUMBER_THREADS` is 1. It can be modified in the YAML file.
+```yaml
+#frontend settings
+torchrun:
+    nproc-per-node: 4 # specifies the number of processes torchrun starts to serve your model, set to world_size or the number of
+                      # gpus you wish to split your model across
+    OMP_NUMBER_THREADS: 2
+```
+#### The job ticket feature is recommended for latency-sensitive inference use cases
+When the job ticket feature is enabled, TorchServe verifies the availability of a model's active worker for processing a client's request. If an active worker is available, the request is accepted and processed immediately, without the waiting time incurred by the job queue or dynamic batching; otherwise, a 503 response is sent back to the client.
+
+This feature helps with use cases where inference latency can be high, such as generative models and autoregressive decoder models like ChatGPT. It helps such applications take effective actions, for example routing the rejected request to a different server, or scaling up model server capacity, based on the business requirements. Here is an example of enabling the job ticket feature:
+```yaml
+minWorkers: 2
+maxWorkers: 2
+jobQueueSize: 2
+useJobTicket: true
+```
+In this example, a model has 2 workers with job queue size 2.
An inference request will be either processed by TorchServe immediately, or rejected with response code 503. diff --git a/docs/management_api.md b/docs/management_api.md index c7e7af5d9f..991746fe52 100644 --- a/docs/management_api.md +++ b/docs/management_api.md @@ -1,4 +1,4 @@ -# Management API +# [Management API](#management-api) TorchServe provides the following APIs that allows you to manage models at runtime: @@ -41,13 +41,13 @@ curl -X POST "http://localhost:8081/models?url=https://torchserve.pytorch.org/m } ``` -### Encrypted model serving +### Encrypted model serving If you'd like to serve an encrypted model then you need to setup [S3 SSE-KMS](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html) with the following environment variables: * AWS_ACCESS_KEY_ID * AWS_SECRET_ACCESS_KEY * AWS_DEFAULT_REGION -And set "s3_sse_kms=true" in HTTP request. +And set "s3_sse_kms=true" in HTTP request. For example: model squeezenet1_1 is [encrypted on S3 under your own private account](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html). The model http url on S3 is `https://torchserve.pytorch.org/sse-test/squeezenet1_1.mar`. - if torchserve will run on EC2 instance (e.g. OS: ubuntu) @@ -86,7 +86,7 @@ curl -v -X POST "http://localhost:8081/models?initial_workers=1&synchronous=fals < x-request-id: 4dc54158-c6de-42aa-b5dd-ebcb5f721043 < content-length: 47 < connection: keep-alive -< +< { "status": "Processing worker updates..." } @@ -102,7 +102,7 @@ curl -v -X POST "http://localhost:8081/models?initial_workers=1&synchronous=true < x-request-id: ecd2e502-382f-4c3b-b425-519fbf6d3b85 < content-length: 89 < connection: keep-alive -< +< { "status": "Model \"squeezenet1_1\" Version: 1.0 registered with 1 initial workers" } @@ -118,7 +118,7 @@ This API follows the [ManagementAPIsService.ScaleWorker](https://github.com/pyto * `min_worker` - (optional) the minimum number of worker processes. TorchServe will try to maintain this minimum for specified model. The default value is `1`. * `max_worker` - (optional) the maximum number of worker processes. TorchServe will make no more that this number of workers for the specified model. The default is the same as the setting for `min_worker`. * `synchronous` - whether or not the call is synchronous. The default value is `false`. -* `timeout` - the specified wait time for a worker to complete all pending requests. If exceeded, the work process will be terminated. Use `0` to terminate the backend worker process immediately. Use `-1` to wait infinitely. The default value is `-1`. +* `timeout` - the specified wait time for a worker to complete all pending requests. If exceeded, the work process will be terminated. Use `0` to terminate the backend worker process immediately. Use `-1` to wait infinitely. The default value is `-1`. Use the Scale Worker API to dynamically adjust the number of workers for any version of a model to better serve different inference request loads. @@ -134,7 +134,7 @@ curl -v -X PUT "http://localhost:8081/models/noop?min_worker=3" < x-request-id: 42adc58e-6956-4198-ad07-db6c620c4c1e < content-length: 47 < connection: keep-alive -< +< { "status": "Processing worker updates..." 
}
@@ -150,7 +150,7 @@ curl -v -X PUT "http://localhost:8081/models/noop?min_worker=3&synchronous=true"
 < x-request-id: b72b1ea0-81c6-4cce-92c4-530d3cfe5d4a
 < content-length: 63
 < connection: keep-alive
-<
+<
 {
   "status": "Workers scaled to 3 for model: noop"
 }
@@ -169,7 +169,7 @@ curl -v -X PUT "http://localhost:8081/models/noop/2.0?min_worker=3&synchronous=t
 < x-request-id: 3997ccd4-ae44-4570-b249-e361b08d3d47
 < content-length: 77
 < connection: keep-alive
-<
+<
 {
   "status": "Workers scaled to 3 for model: noop, version: 2.0"
 }
@@ -290,7 +290,7 @@ curl http://localhost:8081/models/noop/all
 ```
 `GET /models/{model_name}/{model_version}?customized=true`
-or
+or
 `GET /models/{model_name}?customized=true`
 
 Use the Describe Model API to get detailed runtime status and customized metadata of a version of a model:
diff --git a/docs/metrics.md b/docs/metrics.md
index 5ca5ee2230..9993948683 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -1,4 +1,4 @@
-# TorchServe Metrics
+# [TorchServe Metrics](#torchserve-metrics)
 
 ## Contents of this document
 
@@ -13,42 +13,61 @@
 
 ## Introduction
 
-TorchServe collects system level metrics in regular intervals, and also provides an API to collect custom metrics.
-Metrics collected by metrics are logged and can be aggregated by metric agents.
-The system level metrics are collected every minute. Metrics defined by the custom service code can be collected per request or per a batch of requests.
-TorchServe logs these two sets of metrics to different log files.
-Metrics are collected by default at:
+Torchserve metrics can be broadly classified into frontend and backend metrics.
+Frontend metrics include system level metrics. The host resource utilization frontend metrics are collected at regular intervals (default: every minute).
+Torchserve provides an API to collect custom backend metrics. Metrics defined by custom service or handler code can be collected per request or per batch of requests.
+Two metric modes are supported, i.e. `log` and `prometheus`. The default mode is `log`.
+The metrics mode can be configured using the `metrics_mode` configuration option in `config.properties` or the `TS_METRICS_MODE` environment variable.
+For further details on `config.properties` and environment variable based configuration, refer to the [Torchserve config](configuration.md) docs.
 
-* System metrics - `log_directory/ts_metrics.log`
-* Custom metrics - `log directory/model_metrics.log`
+In `log` mode, metrics are logged and can be aggregated by metric agents.
+Metrics are collected by default at the following locations in `log` mode:
+
+* Frontend metrics - `log_directory/ts_metrics.log`
+* Backend metrics - `log_directory/model_metrics.log`
 
 The location of log files and metric files can be configured in the [log4j2.xml](https://github.com/pytorch/serve/blob/master/frontend/server/src/main/resources/log4j2.xml) file
 
-## System Metrics
-
-| Metric Name | Dimension | Unit | Semantics |
-|---|---|---|---|
-| CPUUtilization | host | percentage | CPU utilization on host |
-| DiskAvailable | host | GB | disk available on host |
-| DiskUsed | host | GB | disk used on host |
-| DiskUtilization | host | percentage | disk used on host |
-| MemoryAvailable | host | MB | memory available on host |
-| MemoryUsed | host | MB | memory used on host |
-| MemoryUtilization | host | percentage | memory utilization on host |
-| GPUUtilization | host,device_id | percentage | GPU utilization on host,device_id |
-| GPUMemoryUtilization | host,device_id | percentage | GPU memory utilization on host,device_id |
-| GPUMemoryUsed | host,device_id | MB | GPU memory used on host,device_id |
-| Requests2XX | host | count | logged for every request responded in 200-300 status code range |
-| Requests4XX | host |count | logged for every request responded in 400-500 status code range |
-| Requests5XX | host | count | logged for every request responded with status code above 500 |
+In `prometheus` mode, all metrics are made available in prometheus format via the [metrics](https://github.com/pytorch/serve/blob/master/docs/metrics_api.md) API endpoint.
+
+## Frontend Metrics
+
+| Metric Name                       | Type    | Unit         | Dimensions                          | Semantics                                                                    |
+|-----------------------------------|---------|--------------|-------------------------------------|------------------------------------------------------------------------------|
+| Requests2XX                       | counter | Count        | Level, Hostname                     | Total number of requests with response in 200-300 status code range          |
+| Requests4XX                       | counter | Count        | Level, Hostname                     | Total number of requests with response in 400-500 status code range          |
+| Requests5XX                       | counter | Count        | Level, Hostname                     | Total number of requests with response status code above 500                 |
+| ts_inference_requests_total       | counter | Count        | model_name, model_version, hostname | Total number of inference requests received                                  |
+| ts_inference_latency_microseconds | counter | Microseconds | model_name, model_version, hostname | Total inference latency in Microseconds                                      |
+| ts_queue_latency_microseconds     | counter | Microseconds | model_name, model_version, hostname | Total queue latency in Microseconds                                          |
+| QueueTime                         | gauge   | Milliseconds | Level, Hostname                     | Time spent by a job in request queue in Milliseconds                         |
+| WorkerThreadTime                  | gauge   | Milliseconds | Level, Hostname                     | Time spent in worker thread excluding backend response time in Milliseconds  |
+| WorkerLoadTime                    | gauge   | Milliseconds | WorkerName, Level, Hostname         | Time taken by worker to load model in Milliseconds                           |
+| CPUUtilization                    | gauge   | Percent      | Level, Hostname                     | CPU utilization on host                                                      |
+| MemoryUsed                        | gauge   | Megabytes    | Level, Hostname                     | Memory used on host                                                          |
+| MemoryAvailable                   | gauge   | Megabytes    | Level, Hostname                     | Memory available on host                                                     |
+| MemoryUtilization                 | gauge   | Percent      | Level, Hostname                     | Memory utilization on host                                                   |
+| DiskUsage                         | gauge   | Gigabytes    | Level, Hostname                     | Disk used on host                                                            |
+| DiskUtilization                   | gauge   | Percent      | Level, Hostname                     | Disk utilization on host                                                     |
+| DiskAvailable                     | gauge   | Gigabytes    | Level, Hostname                     | Disk available on host                                                       |
+| GPUMemoryUtilization              | gauge   | Percent      | Level, DeviceId, Hostname           | GPU memory utilization on host, DeviceId                                     |
+| GPUMemoryUsed                     | gauge   | Megabytes    | Level, DeviceId, Hostname           | GPU memory used on host, DeviceId                                            |
+| GPUUtilization                    | gauge   | Percent      | Level, DeviceId, Hostname           | GPU utilization on host, DeviceId                                            |
+
+## Backend Metrics
+
+| Metric Name                       | Type  | Unit | Dimensions                 | Semantics                     |
+|-----------------------------------|-------|------|----------------------------|-------------------------------|
+| HandlerTime                       | gauge | ms   | ModelName, Level, Hostname | Time spent in backend handler |
+| PredictionTime                    | gauge | ms   | ModelName, Level, Hostname | Backend prediction time       |
 
 ## Formatting
 
 TorchServe emits metrics to log files by default. The metrics are formatted in a [StatsD](https://github.com/etsy/statsd) like format.
 
 ```bash
-CPUUtilization.Percent:0.0|#Level:Host|#hostname:my_machine_name
-MemoryUsed.Megabytes:13840.328125|#Level:Host|#hostname:my_machine_name
+CPUUtilization.Percent:0.0|#Level:Host|#hostname:my_machine_name,timestamp:1682098185
+DiskAvailable.Gigabytes:318.0416717529297|#Level:Host|#hostname:my_machine_name,timestamp:1682098185
 ```
 
 To enable metric logging in JSON format, set "patternlayout" as "JSONPatternLayout" in [log4j2.xml](https://github.com/pytorch/serve/blob/master/frontend/server/src/main/resources/log4j2.xml) (See sample [log4j2-json.xml](https://github.com/pytorch/serve/blob/master/frontend/server/src/test/resources/log4j2-json.xml)). For information, see [Logging in Torchserve](https://github.com/pytorch/serve/blob/master/docs/logging.md).
 
@@ -121,7 +140,7 @@ class MetricTypes(enum.Enum):
 
 ## Central metrics YAML file definition
 
-TorchServe defines metrics in a [metrics_default.yaml](https://github.com/pytorch/serve/blob/master/frontend/server/src/test/resources/metrics_default.yaml)
+TorchServe defines metrics in a [yaml](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml)
 file, including both frontend metrics (i.e. `ts_metrics`) and backend metrics (i.e. `model_metrics`).
 When TorchServe is started, the metrics definition is loaded in the frontend and backend cache separately.
 The backend flushes the metrics cache once a load model or inference request is completed.
 
@@ -131,8 +150,6 @@ Dynamic updates between the frontend and backend are _not_ currently being handl
 The `metrics.yaml` is formatted with Prometheus metric type terminology:
 
 ```yaml
-mode: prometheus
-
 dimensions: # dimension aliases
   - &model_name "ModelName"
   - &level "Level"
@@ -170,7 +187,7 @@ model_metrics: # backend metrics
 ```
 
-These are the default metrics within the yaml file, but the user can either delete them to their liking / ignore them altogether, because these metrics will not be emitted unless they are edited.
+Default metrics are provided in the [metrics.yaml](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml) file, but the user can delete them or ignore them altogether, because these metrics are not emitted unless they are updated.
 
 ### How it works
 
@@ -184,7 +201,7 @@ parse the backend metrics from the yaml file.*
 
 ### User Manual - starting TorchServe with a yaml file specified
 
-1. Create a `metrics.yaml` file to parse metrics from OR utilize default [metrics_default.yaml](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml)
+1. Create a `metrics.yaml` file to parse metrics from OR utilize the default [metrics.yaml](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml)
 
 2. Set `metrics_config` argument equal to the yaml file path in the `config.properties` being used:
 
@@ -207,7 +224,7 @@ parse the backend metrics from the yaml file.*
 
 ## Custom Metrics API
 
-TorchServe enables the custom service code to emit metrics that are then logged by the system.
+TorchServe enables the custom service code to emit metrics that are then made available based on the configured `metrics_mode`.
 
 The custom service code is provided with a [context](https://github.com/pytorch/serve/blob/master/ts/context.py) of the current request with a metrics object:
 
@@ -235,7 +252,7 @@ metrics.add_counter("CounterMetric", value=1, dimensions=[Dimension("name", "val
 
 ### Updating Metrics parsed from the yaml file
 
-Given the Metrics API, users will also be able to update metrics that have been parsed from the [yaml](https://github.com/pytorch/serve/blob/master/frontend/server/src/test/resources/metrics_default.yaml) file
+Given the Metrics API, users will also be able to update metrics that have been parsed from the [yaml](https://github.com/pytorch/serve/blob/master/ts/configs/metrics.yaml) file
 given some criteria: (we will use the following metric as an example)
 
@@ -551,7 +568,8 @@ class ExampleCustomHandler(BaseHandler, ABC):
 context.metrics.add_counter(...)
 ```
 
-This custom metrics information is logged in the `model_metrics.log` file configured through [log4j2.xml](https://github.com/pytorch/serve/blob/master/frontend/server/src/main/resources/log4j2.xml) file.
+This custom metrics information is logged in the `model_metrics.log` file configured through the [log4j2.xml](https://github.com/pytorch/serve/blob/master/frontend/server/src/main/resources/log4j2.xml) file,
+or made available via the [metrics](https://github.com/pytorch/serve/blob/master/docs/metrics_api.md) API endpoint based on the `metrics_mode` configuration.
 
 ## Metrics YAML File Parsing and Metrics API Custom Handler Example
 
diff --git a/docs/metrics_api.md b/docs/metrics_api.md
index edbee43a74..c1ca10c2d5 100644
--- a/docs/metrics_api.md
+++ b/docs/metrics_api.md
@@ -1,38 +1,83 @@
 # Metrics API
 
-Metrics API is listening on port 8082 and only accessible from localhost by default. To change the default setting, see [TorchServe Configuration](configuration.md). The default metrics endpoint returns Prometheus formatted metrics. You can query metrics using curl requests or point a [Prometheus Server](#prometheus-server) to the endpoint and use [Grafana](#grafana) for dashboards.
+The Metrics API listens on port 8082 and is only accessible from localhost by default. To change the default setting, see [TorchServe Configuration](configuration.md). The default metrics endpoint returns Prometheus formatted metrics when the [metrics_mode](https://github.com/pytorch/serve/blob/master/docs/metrics.md) configuration is set to `prometheus`. You can query metrics using curl requests or point a [Prometheus Server](#prometheus-server) to the endpoint and use [Grafana](#grafana) for dashboards.
 
-By default these APIs are enable however same can be disabled by setting `enable_metrics_api=false` in torchserve config.properties file.
+By default this API is enabled; however, it can be disabled by setting `enable_metrics_api=false` in the torchserve config.properties file. For details, refer to the [Torchserve config](configuration.md) docs.
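+
+For instance, a minimal `config.properties` enabling Prometheus formatted metrics could look like the following sketch (the values are illustrative; `metrics_mode` and `enable_metrics_api` are the options described above):
+
+```
+# metrics endpoint is enabled by default; shown here for clarity
+enable_metrics_api=true
+# emit metrics in prometheus format instead of the default log mode
+metrics_mode=prometheus
+```
+
+The sample output below assumes `metrics_mode` is set to `prometheus`: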
```console curl http://127.0.0.1:8082/metrics -# HELP ts_inference_latency_microseconds Cumulative inference duration in microseconds -# TYPE ts_inference_latency_microseconds counter -ts_inference_latency_microseconds{uuid="d5f84dfb-fae8-4f92-b217-2f385ca7470b",model_name="noopversioned",model_version="1.11",} 1990.348 -ts_inference_latency_microseconds{uuid="d5f84dfb-fae8-4f92-b217-2f385ca7470b",model_name="noop",model_version="default",} 2032.411 -# HELP ts_inference_requests_total Total number of inference requests. +# HELP Requests5XX Torchserve prometheus counter metric with unit: Count +# TYPE Requests5XX counter +# HELP DiskUsage Torchserve prometheus gauge metric with unit: Gigabytes +# TYPE DiskUsage gauge +DiskUsage{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 20.054508209228516 +# HELP GPUUtilization Torchserve prometheus gauge metric with unit: Percent +# TYPE GPUUtilization gauge +# HELP PredictionTime Torchserve prometheus gauge metric with unit: ms +# TYPE PredictionTime gauge +PredictionTime{ModelName="resnet18",Level="Model",Hostname="88665a372f4b.ant.amazon.com",} 83.13 +# HELP WorkerLoadTime Torchserve prometheus gauge metric with unit: Milliseconds +# TYPE WorkerLoadTime gauge +WorkerLoadTime{WorkerName="W-9000-resnet18_1.0",Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 4593.0 +WorkerLoadTime{WorkerName="W-9001-resnet18_1.0",Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 4592.0 +# HELP MemoryAvailable Torchserve prometheus gauge metric with unit: Megabytes +# TYPE MemoryAvailable gauge +MemoryAvailable{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 5829.7421875 +# HELP GPUMemoryUsed Torchserve prometheus gauge metric with unit: Megabytes +# TYPE GPUMemoryUsed gauge +# HELP ts_inference_requests_total Torchserve prometheus counter metric with unit: Count # TYPE ts_inference_requests_total counter -ts_inference_requests_total{uuid="d5f84dfb-fae8-4f92-b217-2f385ca7470b",model_name="noopversioned",model_version="1.11",} 1.0 -ts_inference_requests_total{uuid="d5f84dfb-fae8-4f92-b217-2f385ca7470b",model_name="noop",model_version="default",} 1.0 -# HELP ts_queue_latency_microseconds Cumulative queue duration in microseconds +ts_inference_requests_total{model_name="resnet18",model_version="default",hostname="88665a372f4b.ant.amazon.com",} 3.0 +# HELP GPUMemoryUtilization Torchserve prometheus gauge metric with unit: Percent +# TYPE GPUMemoryUtilization gauge +# HELP HandlerTime Torchserve prometheus gauge metric with unit: ms +# TYPE HandlerTime gauge +HandlerTime{ModelName="resnet18",Level="Model",Hostname="88665a372f4b.ant.amazon.com",} 82.93 +# HELP ts_inference_latency_microseconds Torchserve prometheus counter metric with unit: Microseconds +# TYPE ts_inference_latency_microseconds counter +ts_inference_latency_microseconds{model_name="resnet18",model_version="default",hostname="88665a372f4b.ant.amazon.com",} 290371.129 +# HELP CPUUtilization Torchserve prometheus gauge metric with unit: Percent +# TYPE CPUUtilization gauge +CPUUtilization{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 0.0 +# HELP MemoryUsed Torchserve prometheus gauge metric with unit: Megabytes +# TYPE MemoryUsed gauge +MemoryUsed{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 8245.62109375 +# HELP QueueTime Torchserve prometheus gauge metric with unit: Milliseconds +# TYPE QueueTime gauge +QueueTime{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 0.0 +# HELP ts_queue_latency_microseconds Torchserve prometheus counter metric with unit: Microseconds # 
TYPE ts_queue_latency_microseconds counter -ts_queue_latency_microseconds{uuid="d5f84dfb-fae8-4f92-b217-2f385ca7470b",model_name="noopversioned",model_version="1.11",} 364.884 -ts_queue_latency_microseconds{uuid="d5f84dfb-fae8-4f92-b217-2f385ca7470b",model_name="noop",model_version="default",} 82.349 +ts_queue_latency_microseconds{model_name="resnet18",model_version="default",hostname="88665a372f4b.ant.amazon.com",} 365.21 +# HELP DiskUtilization Torchserve prometheus gauge metric with unit: Percent +# TYPE DiskUtilization gauge +DiskUtilization{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 5.8 +# HELP Requests2XX Torchserve prometheus counter metric with unit: Count +# TYPE Requests2XX counter +Requests2XX{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 8.0 +# HELP Requests4XX Torchserve prometheus counter metric with unit: Count +# TYPE Requests4XX counter +# HELP WorkerThreadTime Torchserve prometheus gauge metric with unit: Milliseconds +# TYPE WorkerThreadTime gauge +WorkerThreadTime{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 1.0 +# HELP DiskAvailable Torchserve prometheus gauge metric with unit: Gigabytes +# TYPE DiskAvailable gauge +DiskAvailable{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 325.05113983154297 +# HELP MemoryUtilization Torchserve prometheus gauge metric with unit: Percent +# TYPE MemoryUtilization gauge +MemoryUtilization{Level="Host",Hostname="88665a372f4b.ant.amazon.com",} 64.4 ``` ```console curl "http://127.0.0.1:8082/metrics?name[]=ts_inference_latency_microseconds&name[]=ts_queue_latency_microseconds" --globoff -# HELP ts_inference_latency_microseconds Cumulative inference duration in microseconds -# TYPE ts_inference_latency_microseconds counter -ts_inference_latency_microseconds{uuid="d5f84dfb-fae8-4f92-b217-2f385ca7470b",model_name="noopversioned",model_version="1.11",} 1990.348 -ts_inference_latency_microseconds{uuid="d5f84dfb-fae8-4f92-b217-2f385ca7470b",model_name="noop",model_version="default",} 2032.411 -# HELP ts_queue_latency_microseconds Cumulative queue duration in microseconds +# HELP ts_queue_latency_microseconds Torchserve prometheus counter metric with unit: Microseconds # TYPE ts_queue_latency_microseconds counter -ts_queue_latency_microseconds{uuid="d5f84dfb-fae8-4f92-b217-2f385ca7470b",model_name="noopversioned",model_version="1.11",} 364.884 -ts_queue_latency_microseconds{uuid="d5f84dfb-fae8-4f92-b217-2f385ca7470b",model_name="noop",model_version="default",} 82.349 +ts_queue_latency_microseconds{model_name="resnet18",model_version="default",hostname="88665a372f4b.ant.amazon.com",} 365.21 +# HELP ts_inference_latency_microseconds Torchserve prometheus counter metric with unit: Microseconds +# TYPE ts_inference_latency_microseconds counter +ts_inference_latency_microseconds{model_name="resnet18",model_version="default",hostname="88665a372f4b.ant.amazon.com",} 290371.129 ``` #### Prometheus server @@ -52,15 +97,15 @@ scrape_configs: static_configs: - targets: ['localhost:8082'] #TorchServe metrics endpoint ``` -Navigate to `http://localhost:9090/` on a browser to execute queries and create graphs +Navigate to `http://localhost:9090/` on a browser to execute queries and create graphs -PrometheusServer +Prometheus Server #### Grafana Once you have the Torchserve and Prometheus servers running, you can further [setup](https://prometheus.io/docs/visualization/grafana/) Grafana, point it to Prometheus server and navigate to `http://localhost:3000/` to create dashboards and graphs. 
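+
+As an illustration, once a Prometheus server is scraping the endpoint, a dashboard panel could chart the per-model request rate with a query like the following sketch (PromQL; the metric and label names match the sample output above, the rate window is an assumption):
+
+```
+rate(ts_inference_requests_total{model_name="resnet18"}[5m])
+```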
-You can use command given below to start Grafana -
+You can use the command given below to start Grafana:
 `sudo systemctl daemon-reload && sudo systemctl enable grafana-server && sudo systemctl start grafana-server`
 
-Screen Shot 2020-07-08 at 5 51 57 PM
+Grafana Dashboard
diff --git a/docs/mps.md b/docs/mps.md
new file mode 100644
index 0000000000..4b10048435
--- /dev/null
+++ b/docs/mps.md
@@ -0,0 +1,91 @@
+# Running TorchServe with NVIDIA MPS
+In order to deploy ML models, TorchServe spins up each worker in a separate process, thus isolating each worker from the others.
+Each process creates its own CUDA context to execute its kernels and access the allocated memory.
+
+While NVIDIA GPUs in their default setting allow multiple processes to run CUDA kernels on a single device, this involves the following drawbacks:
+* The execution of the kernels is generally serialized
+* Each process creates its own CUDA context which occupies additional GPU memory
+
+For these scenarios NVIDIA offers the Multi-Process Service (MPS) which:
+* Allows multiple processes to share the same CUDA context on the same GPU
+* Runs their kernels in a parallel fashion
+
+This can result in:
+* Increased performance when using multiple workers on the same GPU
+* Decreased GPU memory utilization due to the shared context
+
+
+To leverage the benefits of NVIDIA MPS we need to start the MPS daemon with the following commands before starting up TorchServe itself.
+```
+sudo nvidia-smi -c 3
+nvidia-cuda-mps-control -d
+```
+The first command enables exclusive processing mode for the GPU, allowing only one process (the MPS daemon) to utilize it.
+The second command starts the MPS daemon itself.
+To shut down the daemon we can execute:
+```
+echo quit | nvidia-cuda-mps-control
+```
+For more details on MPS please refer to [NVIDIA's MPS documentation](https://docs.nvidia.com/deploy/mps/index.html).
+It should be noted that MPS only allows 48 processes (for Volta GPUs) to connect to the daemon due to limited hardware resources.
+Adding more clients/workers (to the same GPU) will lead to a failure.
+
+## Benchmarks
+To show the performance of TorchServe with MPS activated, and to help you decide whether or not to enable MPS for your deployment, we perform some benchmarks with representative workloads.
+
+Primarily, we want to investigate how the throughput of a worker evolves with activated MPS for different operation points.
+As an example workload for our benchmark we select the [HuggingFace Transformers Sequence Classification example](https://github.com/pytorch/serve/tree/master/examples/Huggingface_Transformers#sequence-classification).
+We perform the benchmark on a g4dn.4xlarge as well as a p3.2xlarge instance on AWS.
+Both instance types provide one GPU per instance, which results in multiple workers being scheduled on the same GPU.
+For the benchmark we concentrate on the model throughput as measured by the [benchmark-ab.py](https://github.com/pytorch/serve/tree/master/benchmarks/benchmark-ab.py) tool.
+
+First, we measure the throughput of a single worker for different batch sizes, as this shows us at which point the compute resources of the GPU are fully occupied.
+Second, we measure the throughput with two deployed workers for the batch sizes where we expect the GPU to still have some resources left over to share.
+For each benchmark we perform five runs and take the median over the runs.
+
+We use the following config.json for the benchmark, only overwriting the number of workers and the batch size accordingly.
+
+```
+{
+    "url":"/home/ubuntu/serve/examples/Huggingface_Transformers/model_store/BERTSeqClassification",
+    "requests": 10000,
+    "concurrency": 600,
+    "input": "/home/ubuntu/serve/examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text_captum_input.txt",
+    "workers": "1"
+}
+```
+Please note that we set the concurrency level to 600, which makes sure that the batch aggregation inside TorchServe fills the batches up to the maximum batch size. However, this also skews the latency measurements, as many requests will be waiting in the queue to be processed. We will therefore neglect the latency measurements in the following.
+
+### G4 Instance
+We first perform the single worker benchmark for the G4 instance.
+In the figure below we see that the throughput increases steadily with the batch size up to a batch size of four.
+
+![G4 benchmark, single worker](images/mps_g4_single.png)
+
+Next, we increase the number of workers to two in order to compare the throughput with and without MPS running.
+To enable MPS for the second set of runs we first set the exclusive processing mode for the GPU and then start the MPS daemon as shown above.
+
+We select the batch size between one and eight according to our previous findings.
+In the figure we can see that the performance in terms of throughput can be better for batch sizes 1 and 8 (up to +18%) while it can be worse for others (-11%).
+An interpretation of this result could be that the G4 instance does not have many resources to share when we run a BERT model in one of the workers.
+
+![G4 benchmark, two workers](images/mps_g4_two_worker.png)
+
+### P3 instance
+Next, we will run the same experiment with the bigger p3.2xlarge instance.
+With a single worker we get the following throughput values:
+
+![P3 benchmark, single worker](images/mps_p3_single.png)
+
+We can see that the throughput steadily increases, but for batch sizes over eight we see diminishing returns.
+Finally, we deploy two workers on the P3 instance and compare running them with and without MPS.
+We can see that for batch sizes between 1 and 32 the throughput is consistently higher (up to +25%) with MPS enabled, with the exception of batch size 16.
+
+![P3 benchmark, two workers](images/mps_p3_two_worker.png)
+
+## Summary
+In the previous section we saw that enabling MPS for two workers running the same model gives mixed results.
+For the smaller G4 instance we only saw benefits in certain operation points, while we saw more consistent improvements for the bigger P3 instance.
+This suggests that the throughput benefit of running a deployment with MPS is highly workload and environment dependent, and needs to be determined for specific situations using appropriate benchmarks and tools.
+It should be noted that the previous benchmark solely focused on throughput and neglected latency and memory footprint.
+Because using MPS creates only a single CUDA context, more workers can be packed onto the same GPU, which needs to be considered as well in such scenarios.
diff --git a/docs/performance_guide.md b/docs/performance_guide.md
index aa0451b156..b22be3f7e2 100644
--- a/docs/performance_guide.md
+++ b/docs/performance_guide.md
@@ -1,4 +1,4 @@
-# Performance Guide
+# [Performance Guide](#performance-guide)
 In case you're interested in optimizing the memory usage, latency or throughput of a PyTorch model served with TorchServe, this is the guide for you.
## Optimizing PyTorch There are many tricks to optimize PyTorch models for production including but not limited to distillation, quantization, fusion, pruning, setting environment variables and we encourage you to benchmark and see what works best for you. An experimental tool that may make this process easier is https://pypi.org/project/torchprep. @@ -9,7 +9,7 @@ In general it's hard to optimize models and the easiest approach can be exportin `pip install torchserve[onnx]` -In particular TorchServe has native support for ONNX models which can be loaded via ORT for both accelerated CPU and GPU inference. ONNX operates a bit differentyl from a regular PyTorch model in that when you're running the conversion you need to explicity set and name your input and output dimensions. See https://github.com/pytorch/serve/blob/master/test/pytest/test_onnx.py for an example. So at a high level what TorchServe allows you to do is +In particular TorchServe has native support for ONNX models which can be loaded via ORT for both accelerated CPU and GPU inference. ONNX operates a bit differently from a regular PyTorch model in that when you're running the conversion you need to explicitly set and name your input and output dimensions. See https://github.com/pytorch/serve/blob/master/test/pytest/test_onnx.py for an example. So at a high level what TorchServe allows you to do is 1. Package serialized ONNX weights `torch-model-archiver --serialized-file model.onnx ...` 2. Load those weights from `base_handler.py` using `ort_session = ort.InferenceSession(self.model_pt_path, providers=providers, sess_options=sess_options)` which supports reasonable defaults for both CPU and GPU inference 3. Allow you define custom pre and post processing functions to pass in data in the format your onnx model expects with a custom handler diff --git a/examples/Huggingface_Transformers/Download_Transformer_models.py b/examples/Huggingface_Transformers/Download_Transformer_models.py index 821e89cc98..1ae3c6fd55 100644 --- a/examples/Huggingface_Transformers/Download_Transformer_models.py +++ b/examples/Huggingface_Transformers/Download_Transformer_models.py @@ -20,7 +20,14 @@ def transformers_model_dowloader( - mode, pretrained_model_name, num_labels, do_lower_case, max_length, torchscript + mode, + pretrained_model_name, + num_labels, + do_lower_case, + max_length, + torchscript, + hardware, + batch_size, ): """This function, save the checkpoint, config file along with tokenizer config and vocab files of a transformer model of your choice. 
@@ -98,11 +105,44 @@ def transformers_model_dowloader( add_special_tokens=True, return_tensors="pt", ) - input_ids = inputs["input_ids"].to(device) - attention_mask = inputs["attention_mask"].to(device) model.to(device).eval() - traced_model = torch.jit.trace(model, (input_ids, attention_mask)) - torch.jit.save(traced_model, os.path.join(NEW_DIR, "traced_model.pt")) + if hardware == "neuron": + import torch_neuron + + input_ids = torch.cat([inputs["input_ids"]] * batch_size, 0).to(device) + attention_mask = torch.cat([inputs["attention_mask"]] * batch_size, 0).to( + device + ) + traced_model = torch_neuron.trace(model, (input_ids, attention_mask)) + torch.jit.save( + traced_model, + os.path.join( + NEW_DIR, + "traced_{}_model_neuron_batch_{}.pt".format(model_name, batch_size), + ), + ) + elif hardware == "neuronx": + import torch_neuronx + + input_ids = torch.cat([inputs["input_ids"]] * batch_size, 0).to(device) + attention_mask = torch.cat([inputs["attention_mask"]] * batch_size, 0).to( + device + ) + traced_model = torch_neuronx.trace(model, (input_ids, attention_mask)) + torch.jit.save( + traced_model, + os.path.join( + NEW_DIR, + "traced_{}_model_neuronx_batch_{}.pt".format( + model_name, batch_size + ), + ), + ) + else: + input_ids = inputs["input_ids"].to(device) + attention_mask = inputs["attention_mask"].to(device) + traced_model = torch.jit.trace(model, (input_ids, attention_mask)) + torch.jit.save(traced_model, os.path.join(NEW_DIR, "traced_model.pt")) return @@ -124,7 +164,16 @@ def transformers_model_dowloader( torchscript = True else: torchscript = False + hardware = settings.get("hardware") + batch_size = int(settings.get("batch_size", "1")) transformers_model_dowloader( - mode, model_name, num_labels, do_lower_case, max_length, torchscript + mode, + model_name, + num_labels, + do_lower_case, + max_length, + torchscript, + hardware, + batch_size, ) diff --git a/examples/Huggingface_Transformers/README.md b/examples/Huggingface_Transformers/README.md index a11b7ec919..0c0679d62c 100644 --- a/examples/Huggingface_Transformers/README.md +++ b/examples/Huggingface_Transformers/README.md @@ -1,6 +1,6 @@ ## Serving Huggingface Transformers using TorchServe -In this example, we show how to serve a fine tuned or off the shelf Transformer model from [huggingface](https://huggingface.co/transformers/index.html) using TorchServe. +In this example, we show how to serve a fine tuned or off the shelf Transformer model from [huggingface](https://huggingface.co/transformers/index.html) using TorchServe. We use a custom handler, [Transformer_handler.py](https://github.com/pytorch/serve/blob/master/examples/Huggingface_Transformers/Transformer_handler_generalized.py). @@ -10,7 +10,7 @@ We borrowed ideas to write a custom handler for transformers from tutorial prese To get started [install Torchserve](https://github.com/pytorch/serve) and then - `pip install transformers==4.6.0` + `pip install -r requirements.txt` ### Objectives 1. How to package a transformer into a torch model archive (.mar) file (eager mode or Torchscript) with `torch-model-archiver` @@ -51,21 +51,25 @@ In the setup_config.json : *embedding_name* : The name of embedding layer in the chosen model, this could be `bert` for `bert-base-uncased`, `roberta` for `roberta-base` or `roberta` for `xlm-roberta-large`, or `gpt2` for `gpt2` model +*hardware* : The target platform to trace the model for. 
Specify as `neuron` for [Inferentia1](https://aws.amazon.com/ec2/instance-types/inf1/) and `neuronx` for [Inferentia2](https://aws.amazon.com/ec2/instance-types/inf2/).
+
+*batch_size* : Input batch size when tracing the model for `neuron` or `neuronx` as target hardware.
+
 Once `setup_config.json` has been set properly, the next step is to run
 
 `python Download_Transformer_models.py`
 
-This produces all the required files for packaging using a huggingface transformer model off-the-shelf without fine-tuning process. Using this option will create and saved the required files into Transformer_model directory.
+This produces all the required files for packaging a huggingface transformer model off-the-shelf, without a fine-tuning process. Using this option will create and save the required files into the Transformer_model directory.
 
 #### Setting the extra_files
 
-There are few files that are used for model packaging and at the inference time.
+There are a few files that are used for model packaging and at inference time.
 * `index_to_name.json`: maps predictions to labels
 * `sample_text.txt`: input text for inference
 * `vocab.txt`: by default will use the tokenizer from the pretrained model
 
-For custom vocabs, it is required to pass all other tokenizer related files such `tokenizer_config.json`, `special_tokens_map.json`, `config.json` and if available `merges.txt`.
+For custom vocabs, it is required to pass all other tokenizer related files such as `tokenizer_config.json`, `special_tokens_map.json`, `config.json` and, if available, `merges.txt`.
 
 For examples of how to configure a model for a use case and what the input format should look like
 * Model configuration: `Transformer_model` directory after running `python Download_Transformer_models.py`
 
@@ -278,7 +282,7 @@ For batch inference the main difference is that you need set the batch size whil
     mv BERTSeqClassification.mar model_store/
     torchserve --start --model-store model_store --ts-config config.properties --models BERTSeqClassification= BERTSeqClassification.mar
-    ```
+    ```
 
 Now to run the batch inference following command can be used:
 
 ```
@@ -293,7 +297,7 @@ curl -X POST http://127.0.0.1:8080/predictions/BERTSeqClassification -T ./Seq_c
 
 The [Captum Explanations for Visual Insights Notebook](https://github.com/pytorch/serve/tree/master/examples/captum/Captum_visualization_for_bert.ipynb) provides a visual example for how model interpretations can help
 
-Known issues:
+Known issues:
 * Captum doesn't work well for batched inputs and may result in timeouts
 * No support for torchscripted models
 
@@ -311,14 +315,13 @@ When a json file is passed as a request format to the curl, Torchserve unwraps t
 
 In the setup_config.json, specify `"BetterTransformer":true,`.
 
-Note: make sure to install [HuggingFace Optimum] `pip install optimum`
 
 [Better Transformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) from PyTorch is integrated into [Huggingface Optimum](https://huggingface.co/docs/optimum/bettertransformer/overview), which brings major speedups for many encoder models on different modalities (text, image, audio). It is a one-liner API that we have also added in the `Transformer_handler_generalized.py` in this example. As shown above, you just need to set `"BetterTransformer":true,` in the setup_config.json.
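+
+Putting these options together, a complete `setup_config.json` for this example might look like the following sketch (the field values are illustrative assumptions; `hardware` and `batch_size` are only needed when tracing for Inferentia):
+
+```json
+{
+  "model_name": "bert-base-uncased",
+  "mode": "sequence_classification",
+  "do_lower_case": true,
+  "num_labels": "2",
+  "max_length": "150",
+  "embedding_name": "bert",
+  "BetterTransformer": true,
+  "hardware": "neuronx",
+  "batch_size": "4"
+}
+```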
The main speed ups in Better Transformer come from kernel fusion in the [TransformerEncoder](https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoder.html) and from making use of sparsity with [nested tensors](https://pytorch.org/tutorials/prototype/nestedtensor.html) when input sequences are padded, to avoid unnecessary computation on padded tensors. We have seen up to 4.5x speed up with distill_bert when using higher batch sizes with padding. Please read more about it in this [blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2). You get some speedups even with batch size = 1 and no padding; however, major speed ups will show up when running inference with higher batch sizes (8, 16, 32) with padding.
 
-## Model Parallelism
+## Model Parallelism
 
[Parallelize](https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Model.parallelize) is an experimental feature that HuggingFace recently added to support large model inference for some very large models, GPT2 and T5. GPT2 model choices based on their size are gpt2-medium, gpt2-large, gpt2-xl. This feature only supports LMHeadModel, which can be used for text generation; other applications such as sequence classification, token classification and question answering are not supported. We have added parallelize support for the GPT2 model in the custom handler in this example, which will enable you to perform model parallel inference for GPT2 models used for text generation. The same logic in the handler can be extended to T5 and the applications it supports. Make sure that you register your model with one worker when using this feature. To run this example, a machine with #gpus > 1 is required. The number of required gpus depends on the size of the model. This feature only supports a single node, i.e. one machine with multiple GPUs.
 
@@ -356,7 +359,7 @@ To register the model on TorchServe using the above model archive file, we run t
 ```
 mkdir model_store
 mv Textgeneration.mar model_store/
-torchserve --start --model-store model_store
+torchserve --start --model-store model_store
 curl -X POST "localhost:8081/models?model_name=Textgeneration&url=Textgeneration.mar&batch_size=1&max_batch_delay=5000&initial_workers=1&synchronous=true"
 ```
 
diff --git a/examples/Huggingface_Transformers/Transformer_handler_generalized.py b/examples/Huggingface_Transformers/Transformer_handler_generalized.py
index b469311464..61893d87e4 100644
--- a/examples/Huggingface_Transformers/Transformer_handler_generalized.py
+++ b/examples/Huggingface_Transformers/Transformer_handler_generalized.py
@@ -83,14 +83,10 @@ def initialize(self, ctx):
             logger.warning("Missing the operation mode.")
         # Using the Better Transformer integration to speedup the inference
         if self.setup_config["BetterTransformer"]:
-            try:
-                from optimum.bettertransformer import BetterTransformer
+            from optimum.bettertransformer import BetterTransformer
+            try:
                 self.model = BetterTransformer.transform(self.model)
-            except ImportError as error:
-                logger.warning(
-                    "HuggingFace Optimum is not installed. 
Proceeding without BetterTransformer" - ) except RuntimeError as error: logger.warning( "HuggingFace Optimum is not supporting this model,for the list of supported models, please refer to this doc,https://huggingface.co/docs/optimum/bettertransformer/overview" @@ -377,7 +373,6 @@ def get_insights(self, input_batch, text, target): self.setup_config["mode"] == "sequence_classification" or self.setup_config["mode"] == "token_classification" ): - attributions, delta = self.lig.attribute( inputs=input_ids, baselines=ref_input_ids, diff --git a/examples/Huggingface_Transformers/Transformer_handler_generalized_neuron.py b/examples/Huggingface_Transformers/Transformer_handler_generalized_neuron.py new file mode 100644 index 0000000000..c74ac9e9b4 --- /dev/null +++ b/examples/Huggingface_Transformers/Transformer_handler_generalized_neuron.py @@ -0,0 +1,32 @@ +import os + +import torch +from Transformer_handler_generalized import TransformersSeqClassifierHandler + +if "NEURON_RT_NUM_CORES" not in os.environ: + os.environ["NEURON_RT_NUM_CORES"] = "1" + + +class TransformersSeqClassifierNeuronHandler(TransformersSeqClassifierHandler): + def inference(self, input_batch): + """Predict the class (or classes) of the received text using the + serialized transformers checkpoint. + Args: + input_batch (list): List of Text Tensors from the pre-process function is passed here + Returns: + list : It returns a list of the predicted value for the input text + """ + input_ids_batch, attention_mask_batch = input_batch + num_inferences = len(input_ids_batch) + batch_size = int(self.setup_config.get("batch_size", "1")) + + # insert padding if a partial batch was received + padding = batch_size - num_inferences + if padding > 0: + pad = torch.nn.ConstantPad1d((0, 0, 0, padding), value=0) + input_ids_batch = pad(input_ids_batch) + attention_mask_batch = pad(attention_mask_batch) + + inferences = super().inference((input_ids_batch, attention_mask_batch)) + + return inferences[:num_inferences] diff --git a/examples/Huggingface_Transformers/requirements.txt b/examples/Huggingface_Transformers/requirements.txt new file mode 100644 index 0000000000..196e970d9a --- /dev/null +++ b/examples/Huggingface_Transformers/requirements.txt @@ -0,0 +1,2 @@ +transformers +optimum diff --git a/examples/README.md b/examples/README.md index 1bafd53e58..9dec94c386 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,10 +1,11 @@ -# Examples showcasing TorchServe Features and Integrations +# [Examples showcasing TorchServe Features and Integrations](#torchserve-internals) ## TorchServe Internals * [Creating mar file for an eager mode model](#creating-mar-file-for-eager-mode-model) * [Creating mar file for torchscript mode model](#creating-mar-file-for-torchscript-mode-model) * [Serving custom model with custom service handler](#serving-custom-model-with-custom-service-handler) +* [Serving model using Docker Container](image_classifier/mnist/Docker.md) * [Creating a Workflow](Workflows/dog_breed_classification) * [Custom Metrics](custom_metrics) * [Dynamic Batch Processing](image_classifier/resnet_152_batch) @@ -24,6 +25,8 @@ * [Serving HuggingFace transformers model](Huggingface_Transformers) +### PiPPy [Serving Large Models with PyTorch Native Solution PiPPy](large_models/Huggingface_pippy/Readme.md) + ### MLFlow * [Deploy models using `mlflow-torchserve` plugin](https://github.com/mlflow/mlflow-torchserve/tree/master/examples) @@ -42,7 +45,7 @@ ### Microsoft DeepSpeed-MII -* [HuggingFace Stable Diffusion Model with Microsoft 
DeepSpeed-MII](deepspeed_mii)
+* [HuggingFace Stable Diffusion Model with Microsoft DeepSpeed-MII](large_models/deepspeed_mii/Readme.md)
 
 ### Prometheus and mtail
 
@@ -65,8 +68,8 @@
 ### Stable Diffusion
 * [Stable Diffusion using HuggingFace Diffusers](diffusers)
 
-### HuggingFace Large Models
-* [HuggingFace Large Models with constrained resources](Huggingface_Largemodels)
+### HuggingFace Large Models with Accelerate
+* [HuggingFace Large Models with constrained resources](large_models/Huggingface_accelerate/Readme.md)
 
 ## UseCases
 
@@ -109,7 +112,7 @@ Following are the steps to create a torch-model-archive (.mar) to execute an eag
 
 * Pre-requisites to create a torch model archive (.mar) :
   * serialized-file (.pt) : This file represents the `state_dict` in case of eager mode model.
-  * model-file (.py) : This file contains model class extended from `torch nn`.modules representing the model architecture. This parameter is mandatory for eager mode models. This file must contain only one class definition extended from torch.nn.modules
+  * model-file (.py) : This file contains the model class extended from `torch.nn.Module` representing the model architecture. This parameter is mandatory for eager mode models. This file must contain only one class definition extended from [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html).
   * index_to_name.json : This file contains the mapping of predicted index to class. The default TorchServe handler returns the predicted index and probability. This file can be passed to the model archiver using the --extra-files parameter.
   * version : Model's version.
   * handler : TorchServe default handler's name or path to custom inference handler(.py)
 
diff --git a/examples/Workflows/README.md b/examples/Workflows/README.md
index ea07a487ed..b08ab3ecb9 100644
--- a/examples/Workflows/README.md
+++ b/examples/Workflows/README.md
@@ -1,4 +1,4 @@
-# Workflow examples
+# [Workflow examples](#workflow-examples)
 
 Workflows can be used to compose an ensemble of Pytorch models and Python functions and package them in a `war` file. A workflow is executed as a DAG where the nodes can be either Pytorch models packaged as `mar` files or function nodes specified in the workflow handler file. The DAG can be used to define both sequential and parallel pipelines; a minimal workflow specification is sketched below.
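+
+For orientation, the workflow specification is a YAML file that gets packaged into the `war` file. A minimal sketch for chaining two models could look like this (the model names, archive URLs and omitted optional parameters are assumptions for illustration):
+
+```yaml
+models:
+    model1:
+        url: model1.mar
+    model2:
+        url: model2.mar
+dag:
+    model1: [model2]
+```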
@@ -8,7 +8,7 @@ As an example a sequential pipeline may look something like
 input -> function1 -> model1 -> model2 -> function2 -> output
 ```
 
-And a parallel pipeline may look something like
+And a parallel pipeline may look something like
 ```
           model1
 
diff --git a/examples/cloudformation/README.md b/examples/cloudformation/README.md
index ac460d7309..1150bdc427 100644
--- a/examples/cloudformation/README.md
+++ b/examples/cloudformation/README.md
@@ -66,13 +66,13 @@ aws cloudformation create-stack \
 > curl --insecure "/metrics"
 # HELP ts_queue_latency_microseconds Cumulative queue duration in microseconds
 # TYPE ts_queue_latency_microseconds counter
-ts_queue_latency_microseconds{uuid="e275b494-3d54-45bd-a640-abca741a070b",model_name="squeezenet1_1",model_version="default",} 364.07800000000003
+ts_queue_latency_microseconds{model_name="squeezenet1_1",model_version="default",hostname="test_host",} 364.07800000000003
 # HELP ts_inference_latency_microseconds Cumulative inference duration in microseconds
 # TYPE ts_inference_latency_microseconds counter
-ts_inference_latency_microseconds{uuid="e275b494-3d54-45bd-a640-abca741a070b",model_name="squeezenet1_1",model_version="default",} 128010.02100000001
+ts_inference_latency_microseconds{model_name="squeezenet1_1",model_version="default",hostname="test_host",} 128010.02100000001
 # HELP ts_inference_requests_total Total number of inference requests.
 # TYPE ts_inference_requests_total counter
-ts_inference_requests_total{uuid="e275b494-3d54-45bd-a640-abca741a070b",model_name="squeezenet1_1",model_version="default",} 4.0
+ts_inference_requests_total{model_name="squeezenet1_1",model_version="default",hostname="test_host",} 4.0
 ```
 
@@ -97,7 +97,7 @@ aws cloudformation create-stack \
     ParameterKey=ModelPath,ParameterValue=
 ```
 
-e.g.
+e.g.
 ```
 aws cloudformation create-stack \
   --stack-name torchserve \
@@ -149,13 +149,13 @@ aws cloudformation create-stack \
 > curl "/metrics"
 # HELP ts_queue_latency_microseconds Cumulative queue duration in microseconds
 # TYPE ts_queue_latency_microseconds counter
-ts_queue_latency_microseconds{uuid="2b3a4b5b-5131-413a-a725-2abcae5d55ab",model_name="squeezenet1_1",model_version="default",} 932.164
+ts_queue_latency_microseconds{model_name="squeezenet1_1",model_version="default",hostname="test_host",} 932.164
 # HELP ts_inference_latency_microseconds Cumulative inference duration in microseconds
 # TYPE ts_inference_latency_microseconds counter
-ts_inference_latency_microseconds{uuid="2b3a4b5b-5131-413a-a725-2abcae5d55ab",model_name="squeezenet1_1",model_version="default",} 411702.625
+ts_inference_latency_microseconds{model_name="squeezenet1_1",model_version="default",hostname="test_host",} 411702.625
 # HELP ts_inference_requests_total Total number of inference requests.
 # TYPE ts_inference_requests_total counter
-ts_inference_requests_total{uuid="2b3a4b5b-5131-413a-a725-2abcae5d55ab",model_name="squeezenet1_1",model_version="default",} 9.0
+ts_inference_requests_total{model_name="squeezenet1_1",model_version="default",hostname="test_host",} 9.0
 ```
 
 ## CloudWatch Logging
 
diff --git a/examples/image_classifier/mnist/Docker.md b/examples/image_classifier/mnist/Docker.md
new file mode 100644
index 0000000000..ffd690e85e
--- /dev/null
+++ b/examples/image_classifier/mnist/Docker.md
@@ -0,0 +1,54 @@
+# Digit recognition model with MNIST dataset using Docker container
+
+In this example, we show how to use a pre-trained custom MNIST model to perform real time digit recognition with TorchServe.
+We will be serving the model using a Docker container.
+
+The inference service returns the digit inferred by the model from the input image.
+
+We used the following pytorch example to train the basic MNIST model for digit recognition:
+https://github.com/pytorch/examples/tree/master/mnist
+
+## Serve an MNIST model on TorchServe docker container
+
+Run the commands given in the following steps from the root of the repository. For example, if you cloned the repository into /home/my_path/serve, run the steps from /home/my_path/serve
+
+ ### Create a torch model archive using the torch-model-archiver utility to archive the above files.
+
+ ```bash
+ torch-model-archiver --model-name mnist --version 1.0 --model-file examples/image_classifier/mnist/mnist.py --serialized-file examples/image_classifier/mnist/mnist_cnn.pt --handler examples/image_classifier/mnist/mnist_handler.py
+ ```
+
+ ### Move .mar file into model_store directory
+
+ ```bash
+ mkdir model_store
+ mv mnist.mar model_store/
+ ```
+
+ ### Start a docker container with torchserve
+
+ ```bash
+ docker run --rm -it -p 8080:8080 -p 8081:8081 -p 8082:8082 -v $(pwd)/model_store:/home/model-server/model-store pytorch/torchserve:latest-cpu
+ ```
+
+ ### Register the model on TorchServe using the above model archive file
+
+ ```bash
+ curl -X POST "localhost:8081/models?model_name=mnist&url=mnist.mar&initial_workers=4"
+ ```
+
+ If this succeeds, you will see a message like below
+
+ ```bash
+ {
+   "status": "Model \"mnist\" Version: 1.0 registered with 4 initial workers"
+ }
+ ```
+
+ ### Run digit recognition inference outside the container
+
+ ```bash
+ curl http://127.0.0.1:8080/predictions/mnist -T examples/image_classifier/mnist/test_data/0.png
+ ```
+
+ The output in this case will be a `0`
diff --git a/examples/image_classifier/mnist/README.md b/examples/image_classifier/mnist/README.md
index 2116e593dd..6f09e036ad 100644
--- a/examples/image_classifier/mnist/README.md
+++ b/examples/image_classifier/mnist/README.md
@@ -16,7 +16,7 @@ https://github.com/pytorch/examples/tree/master/mnist
 
 Run the commands given in following steps from the parent directory of the root of the repository. For example, if you cloned the repository into /home/my_path/serve, run the steps from /home/my_path
 
- * Step - 1: Create a new model architecture file which contains model class extended from torch.nn.modules. In this example we have created [mnist model file](mnist.py).
+ * Step - 1: Create a new model architecture file which contains the model class extended from [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html). In this example we have created [mnist model file](mnist.py).
 * Step - 2: Train a MNIST digit recognition model using https://github.com/pytorch/examples/blob/master/mnist/main.py and save the state dict of model. We have added the pre-created [state dict](mnist_cnn.pt) of this model.
 * Step - 3: Write a custom handler to run the inference on your model. In this example, we have added a [custom_handler](mnist_handler.py) which runs the inference on the input grayscale images using the above model and recognizes the digit in the image.
 * Step - 4: Create a torch model archive using the torch-model-archiver utility to archive the above files (see the example command below).
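+
+For reference, the torch-model-archiver invocation for this MNIST example (the same command used in the Docker walkthrough above) looks like:
+
+```bash
+torch-model-archiver --model-name mnist --version 1.0 --model-file examples/image_classifier/mnist/mnist.py --serialized-file examples/image_classifier/mnist/mnist_cnn.pt --handler examples/image_classifier/mnist/mnist_handler.py
+```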
diff --git a/examples/image_classifier/resnet_18/index_to_name.json b/examples/image_classifier/resnet_18/index_to_name.json new file mode 100644 index 0000000000..5fe0dfefcd --- /dev/null +++ b/examples/image_classifier/resnet_18/index_to_name.json @@ -0,0 +1 @@ +{"0": ["n01440764", "tench"], "1": ["n01443537", "goldfish"], "2": ["n01484850", "great_white_shark"], "3": ["n01491361", "tiger_shark"], "4": ["n01494475", "hammerhead"], "5": ["n01496331", "electric_ray"], "6": ["n01498041", "stingray"], "7": ["n01514668", "cock"], "8": ["n01514859", "hen"], "9": ["n01518878", "ostrich"], "10": ["n01530575", "brambling"], "11": ["n01531178", "goldfinch"], "12": ["n01532829", "house_finch"], "13": ["n01534433", "junco"], "14": ["n01537544", "indigo_bunting"], "15": ["n01558993", "robin"], "16": ["n01560419", "bulbul"], "17": ["n01580077", "jay"], "18": ["n01582220", "magpie"], "19": ["n01592084", "chickadee"], "20": ["n01601694", "water_ouzel"], "21": ["n01608432", "kite"], "22": ["n01614925", "bald_eagle"], "23": ["n01616318", "vulture"], "24": ["n01622779", "great_grey_owl"], "25": ["n01629819", "European_fire_salamander"], "26": ["n01630670", "common_newt"], "27": ["n01631663", "eft"], "28": ["n01632458", "spotted_salamander"], "29": ["n01632777", "axolotl"], "30": ["n01641577", "bullfrog"], "31": ["n01644373", "tree_frog"], "32": ["n01644900", "tailed_frog"], "33": ["n01664065", "loggerhead"], "34": ["n01665541", "leatherback_turtle"], "35": ["n01667114", "mud_turtle"], "36": ["n01667778", "terrapin"], "37": ["n01669191", "box_turtle"], "38": ["n01675722", "banded_gecko"], "39": ["n01677366", "common_iguana"], "40": ["n01682714", "American_chameleon"], "41": ["n01685808", "whiptail"], "42": ["n01687978", "agama"], "43": ["n01688243", "frilled_lizard"], "44": ["n01689811", "alligator_lizard"], "45": ["n01692333", "Gila_monster"], "46": ["n01693334", "green_lizard"], "47": ["n01694178", "African_chameleon"], "48": ["n01695060", "Komodo_dragon"], "49": ["n01697457", "African_crocodile"], "50": ["n01698640", "American_alligator"], "51": ["n01704323", "triceratops"], "52": ["n01728572", "thunder_snake"], "53": ["n01728920", "ringneck_snake"], "54": ["n01729322", "hognose_snake"], "55": ["n01729977", "green_snake"], "56": ["n01734418", "king_snake"], "57": ["n01735189", "garter_snake"], "58": ["n01737021", "water_snake"], "59": ["n01739381", "vine_snake"], "60": ["n01740131", "night_snake"], "61": ["n01742172", "boa_constrictor"], "62": ["n01744401", "rock_python"], "63": ["n01748264", "Indian_cobra"], "64": ["n01749939", "green_mamba"], "65": ["n01751748", "sea_snake"], "66": ["n01753488", "horned_viper"], "67": ["n01755581", "diamondback"], "68": ["n01756291", "sidewinder"], "69": ["n01768244", "trilobite"], "70": ["n01770081", "harvestman"], "71": ["n01770393", "scorpion"], "72": ["n01773157", "black_and_gold_garden_spider"], "73": ["n01773549", "barn_spider"], "74": ["n01773797", "garden_spider"], "75": ["n01774384", "black_widow"], "76": ["n01774750", "tarantula"], "77": ["n01775062", "wolf_spider"], "78": ["n01776313", "tick"], "79": ["n01784675", "centipede"], "80": ["n01795545", "black_grouse"], "81": ["n01796340", "ptarmigan"], "82": ["n01797886", "ruffed_grouse"], "83": ["n01798484", "prairie_chicken"], "84": ["n01806143", "peacock"], "85": ["n01806567", "quail"], "86": ["n01807496", "partridge"], "87": ["n01817953", "African_grey"], "88": ["n01818515", "macaw"], "89": ["n01819313", "sulphur-crested_cockatoo"], "90": ["n01820546", "lorikeet"], "91": ["n01824575", "coucal"], "92": 
["n01828970", "bee_eater"], "93": ["n01829413", "hornbill"], "94": ["n01833805", "hummingbird"], "95": ["n01843065", "jacamar"], "96": ["n01843383", "toucan"], "97": ["n01847000", "drake"], "98": ["n01855032", "red-breasted_merganser"], "99": ["n01855672", "goose"], "100": ["n01860187", "black_swan"], "101": ["n01871265", "tusker"], "102": ["n01872401", "echidna"], "103": ["n01873310", "platypus"], "104": ["n01877812", "wallaby"], "105": ["n01882714", "koala"], "106": ["n01883070", "wombat"], "107": ["n01910747", "jellyfish"], "108": ["n01914609", "sea_anemone"], "109": ["n01917289", "brain_coral"], "110": ["n01924916", "flatworm"], "111": ["n01930112", "nematode"], "112": ["n01943899", "conch"], "113": ["n01944390", "snail"], "114": ["n01945685", "slug"], "115": ["n01950731", "sea_slug"], "116": ["n01955084", "chiton"], "117": ["n01968897", "chambered_nautilus"], "118": ["n01978287", "Dungeness_crab"], "119": ["n01978455", "rock_crab"], "120": ["n01980166", "fiddler_crab"], "121": ["n01981276", "king_crab"], "122": ["n01983481", "American_lobster"], "123": ["n01984695", "spiny_lobster"], "124": ["n01985128", "crayfish"], "125": ["n01986214", "hermit_crab"], "126": ["n01990800", "isopod"], "127": ["n02002556", "white_stork"], "128": ["n02002724", "black_stork"], "129": ["n02006656", "spoonbill"], "130": ["n02007558", "flamingo"], "131": ["n02009229", "little_blue_heron"], "132": ["n02009912", "American_egret"], "133": ["n02011460", "bittern"], "134": ["n02012849", "crane"], "135": ["n02013706", "limpkin"], "136": ["n02017213", "European_gallinule"], "137": ["n02018207", "American_coot"], "138": ["n02018795", "bustard"], "139": ["n02025239", "ruddy_turnstone"], "140": ["n02027492", "red-backed_sandpiper"], "141": ["n02028035", "redshank"], "142": ["n02033041", "dowitcher"], "143": ["n02037110", "oystercatcher"], "144": ["n02051845", "pelican"], "145": ["n02056570", "king_penguin"], "146": ["n02058221", "albatross"], "147": ["n02066245", "grey_whale"], "148": ["n02071294", "killer_whale"], "149": ["n02074367", "dugong"], "150": ["n02077923", "sea_lion"], "151": ["n02085620", "Chihuahua"], "152": ["n02085782", "Japanese_spaniel"], "153": ["n02085936", "Maltese_dog"], "154": ["n02086079", "Pekinese"], "155": ["n02086240", "Shih-Tzu"], "156": ["n02086646", "Blenheim_spaniel"], "157": ["n02086910", "papillon"], "158": ["n02087046", "toy_terrier"], "159": ["n02087394", "Rhodesian_ridgeback"], "160": ["n02088094", "Afghan_hound"], "161": ["n02088238", "basset"], "162": ["n02088364", "beagle"], "163": ["n02088466", "bloodhound"], "164": ["n02088632", "bluetick"], "165": ["n02089078", "black-and-tan_coonhound"], "166": ["n02089867", "Walker_hound"], "167": ["n02089973", "English_foxhound"], "168": ["n02090379", "redbone"], "169": ["n02090622", "borzoi"], "170": ["n02090721", "Irish_wolfhound"], "171": ["n02091032", "Italian_greyhound"], "172": ["n02091134", "whippet"], "173": ["n02091244", "Ibizan_hound"], "174": ["n02091467", "Norwegian_elkhound"], "175": ["n02091635", "otterhound"], "176": ["n02091831", "Saluki"], "177": ["n02092002", "Scottish_deerhound"], "178": ["n02092339", "Weimaraner"], "179": ["n02093256", "Staffordshire_bullterrier"], "180": ["n02093428", "American_Staffordshire_terrier"], "181": ["n02093647", "Bedlington_terrier"], "182": ["n02093754", "Border_terrier"], "183": ["n02093859", "Kerry_blue_terrier"], "184": ["n02093991", "Irish_terrier"], "185": ["n02094114", "Norfolk_terrier"], "186": ["n02094258", "Norwich_terrier"], "187": ["n02094433", "Yorkshire_terrier"], "188": 
["n02095314", "wire-haired_fox_terrier"], "189": ["n02095570", "Lakeland_terrier"], "190": ["n02095889", "Sealyham_terrier"], "191": ["n02096051", "Airedale"], "192": ["n02096177", "cairn"], "193": ["n02096294", "Australian_terrier"], "194": ["n02096437", "Dandie_Dinmont"], "195": ["n02096585", "Boston_bull"], "196": ["n02097047", "miniature_schnauzer"], "197": ["n02097130", "giant_schnauzer"], "198": ["n02097209", "standard_schnauzer"], "199": ["n02097298", "Scotch_terrier"], "200": ["n02097474", "Tibetan_terrier"], "201": ["n02097658", "silky_terrier"], "202": ["n02098105", "soft-coated_wheaten_terrier"], "203": ["n02098286", "West_Highland_white_terrier"], "204": ["n02098413", "Lhasa"], "205": ["n02099267", "flat-coated_retriever"], "206": ["n02099429", "curly-coated_retriever"], "207": ["n02099601", "golden_retriever"], "208": ["n02099712", "Labrador_retriever"], "209": ["n02099849", "Chesapeake_Bay_retriever"], "210": ["n02100236", "German_short-haired_pointer"], "211": ["n02100583", "vizsla"], "212": ["n02100735", "English_setter"], "213": ["n02100877", "Irish_setter"], "214": ["n02101006", "Gordon_setter"], "215": ["n02101388", "Brittany_spaniel"], "216": ["n02101556", "clumber"], "217": ["n02102040", "English_springer"], "218": ["n02102177", "Welsh_springer_spaniel"], "219": ["n02102318", "cocker_spaniel"], "220": ["n02102480", "Sussex_spaniel"], "221": ["n02102973", "Irish_water_spaniel"], "222": ["n02104029", "kuvasz"], "223": ["n02104365", "schipperke"], "224": ["n02105056", "groenendael"], "225": ["n02105162", "malinois"], "226": ["n02105251", "briard"], "227": ["n02105412", "kelpie"], "228": ["n02105505", "komondor"], "229": ["n02105641", "Old_English_sheepdog"], "230": ["n02105855", "Shetland_sheepdog"], "231": ["n02106030", "collie"], "232": ["n02106166", "Border_collie"], "233": ["n02106382", "Bouvier_des_Flandres"], "234": ["n02106550", "Rottweiler"], "235": ["n02106662", "German_shepherd"], "236": ["n02107142", "Doberman"], "237": ["n02107312", "miniature_pinscher"], "238": ["n02107574", "Greater_Swiss_Mountain_dog"], "239": ["n02107683", "Bernese_mountain_dog"], "240": ["n02107908", "Appenzeller"], "241": ["n02108000", "EntleBucher"], "242": ["n02108089", "boxer"], "243": ["n02108422", "bull_mastiff"], "244": ["n02108551", "Tibetan_mastiff"], "245": ["n02108915", "French_bulldog"], "246": ["n02109047", "Great_Dane"], "247": ["n02109525", "Saint_Bernard"], "248": ["n02109961", "Eskimo_dog"], "249": ["n02110063", "malamute"], "250": ["n02110185", "Siberian_husky"], "251": ["n02110341", "dalmatian"], "252": ["n02110627", "affenpinscher"], "253": ["n02110806", "basenji"], "254": ["n02110958", "pug"], "255": ["n02111129", "Leonberg"], "256": ["n02111277", "Newfoundland"], "257": ["n02111500", "Great_Pyrenees"], "258": ["n02111889", "Samoyed"], "259": ["n02112018", "Pomeranian"], "260": ["n02112137", "chow"], "261": ["n02112350", "keeshond"], "262": ["n02112706", "Brabancon_griffon"], "263": ["n02113023", "Pembroke"], "264": ["n02113186", "Cardigan"], "265": ["n02113624", "toy_poodle"], "266": ["n02113712", "miniature_poodle"], "267": ["n02113799", "standard_poodle"], "268": ["n02113978", "Mexican_hairless"], "269": ["n02114367", "timber_wolf"], "270": ["n02114548", "white_wolf"], "271": ["n02114712", "red_wolf"], "272": ["n02114855", "coyote"], "273": ["n02115641", "dingo"], "274": ["n02115913", "dhole"], "275": ["n02116738", "African_hunting_dog"], "276": ["n02117135", "hyena"], "277": ["n02119022", "red_fox"], "278": ["n02119789", "kit_fox"], "279": ["n02120079", 
"Arctic_fox"], "280": ["n02120505", "grey_fox"], "281": ["n02123045", "tabby"], "282": ["n02123159", "tiger_cat"], "283": ["n02123394", "Persian_cat"], "284": ["n02123597", "Siamese_cat"], "285": ["n02124075", "Egyptian_cat"], "286": ["n02125311", "cougar"], "287": ["n02127052", "lynx"], "288": ["n02128385", "leopard"], "289": ["n02128757", "snow_leopard"], "290": ["n02128925", "jaguar"], "291": ["n02129165", "lion"], "292": ["n02129604", "tiger"], "293": ["n02130308", "cheetah"], "294": ["n02132136", "brown_bear"], "295": ["n02133161", "American_black_bear"], "296": ["n02134084", "ice_bear"], "297": ["n02134418", "sloth_bear"], "298": ["n02137549", "mongoose"], "299": ["n02138441", "meerkat"], "300": ["n02165105", "tiger_beetle"], "301": ["n02165456", "ladybug"], "302": ["n02167151", "ground_beetle"], "303": ["n02168699", "long-horned_beetle"], "304": ["n02169497", "leaf_beetle"], "305": ["n02172182", "dung_beetle"], "306": ["n02174001", "rhinoceros_beetle"], "307": ["n02177972", "weevil"], "308": ["n02190166", "fly"], "309": ["n02206856", "bee"], "310": ["n02219486", "ant"], "311": ["n02226429", "grasshopper"], "312": ["n02229544", "cricket"], "313": ["n02231487", "walking_stick"], "314": ["n02233338", "cockroach"], "315": ["n02236044", "mantis"], "316": ["n02256656", "cicada"], "317": ["n02259212", "leafhopper"], "318": ["n02264363", "lacewing"], "319": ["n02268443", "dragonfly"], "320": ["n02268853", "damselfly"], "321": ["n02276258", "admiral"], "322": ["n02277742", "ringlet"], "323": ["n02279972", "monarch"], "324": ["n02280649", "cabbage_butterfly"], "325": ["n02281406", "sulphur_butterfly"], "326": ["n02281787", "lycaenid"], "327": ["n02317335", "starfish"], "328": ["n02319095", "sea_urchin"], "329": ["n02321529", "sea_cucumber"], "330": ["n02325366", "wood_rabbit"], "331": ["n02326432", "hare"], "332": ["n02328150", "Angora"], "333": ["n02342885", "hamster"], "334": ["n02346627", "porcupine"], "335": ["n02356798", "fox_squirrel"], "336": ["n02361337", "marmot"], "337": ["n02363005", "beaver"], "338": ["n02364673", "guinea_pig"], "339": ["n02389026", "sorrel"], "340": ["n02391049", "zebra"], "341": ["n02395406", "hog"], "342": ["n02396427", "wild_boar"], "343": ["n02397096", "warthog"], "344": ["n02398521", "hippopotamus"], "345": ["n02403003", "ox"], "346": ["n02408429", "water_buffalo"], "347": ["n02410509", "bison"], "348": ["n02412080", "ram"], "349": ["n02415577", "bighorn"], "350": ["n02417914", "ibex"], "351": ["n02422106", "hartebeest"], "352": ["n02422699", "impala"], "353": ["n02423022", "gazelle"], "354": ["n02437312", "Arabian_camel"], "355": ["n02437616", "llama"], "356": ["n02441942", "weasel"], "357": ["n02442845", "mink"], "358": ["n02443114", "polecat"], "359": ["n02443484", "black-footed_ferret"], "360": ["n02444819", "otter"], "361": ["n02445715", "skunk"], "362": ["n02447366", "badger"], "363": ["n02454379", "armadillo"], "364": ["n02457408", "three-toed_sloth"], "365": ["n02480495", "orangutan"], "366": ["n02480855", "gorilla"], "367": ["n02481823", "chimpanzee"], "368": ["n02483362", "gibbon"], "369": ["n02483708", "siamang"], "370": ["n02484975", "guenon"], "371": ["n02486261", "patas"], "372": ["n02486410", "baboon"], "373": ["n02487347", "macaque"], "374": ["n02488291", "langur"], "375": ["n02488702", "colobus"], "376": ["n02489166", "proboscis_monkey"], "377": ["n02490219", "marmoset"], "378": ["n02492035", "capuchin"], "379": ["n02492660", "howler_monkey"], "380": ["n02493509", "titi"], "381": ["n02493793", "spider_monkey"], "382": ["n02494079", 
"squirrel_monkey"], "383": ["n02497673", "Madagascar_cat"], "384": ["n02500267", "indri"], "385": ["n02504013", "Indian_elephant"], "386": ["n02504458", "African_elephant"], "387": ["n02509815", "lesser_panda"], "388": ["n02510455", "giant_panda"], "389": ["n02514041", "barracouta"], "390": ["n02526121", "eel"], "391": ["n02536864", "coho"], "392": ["n02606052", "rock_beauty"], "393": ["n02607072", "anemone_fish"], "394": ["n02640242", "sturgeon"], "395": ["n02641379", "gar"], "396": ["n02643566", "lionfish"], "397": ["n02655020", "puffer"], "398": ["n02666196", "abacus"], "399": ["n02667093", "abaya"], "400": ["n02669723", "academic_gown"], "401": ["n02672831", "accordion"], "402": ["n02676566", "acoustic_guitar"], "403": ["n02687172", "aircraft_carrier"], "404": ["n02690373", "airliner"], "405": ["n02692877", "airship"], "406": ["n02699494", "altar"], "407": ["n02701002", "ambulance"], "408": ["n02704792", "amphibian"], "409": ["n02708093", "analog_clock"], "410": ["n02727426", "apiary"], "411": ["n02730930", "apron"], "412": ["n02747177", "ashcan"], "413": ["n02749479", "assault_rifle"], "414": ["n02769748", "backpack"], "415": ["n02776631", "bakery"], "416": ["n02777292", "balance_beam"], "417": ["n02782093", "balloon"], "418": ["n02783161", "ballpoint"], "419": ["n02786058", "Band_Aid"], "420": ["n02787622", "banjo"], "421": ["n02788148", "bannister"], "422": ["n02790996", "barbell"], "423": ["n02791124", "barber_chair"], "424": ["n02791270", "barbershop"], "425": ["n02793495", "barn"], "426": ["n02794156", "barometer"], "427": ["n02795169", "barrel"], "428": ["n02797295", "barrow"], "429": ["n02799071", "baseball"], "430": ["n02802426", "basketball"], "431": ["n02804414", "bassinet"], "432": ["n02804610", "bassoon"], "433": ["n02807133", "bathing_cap"], "434": ["n02808304", "bath_towel"], "435": ["n02808440", "bathtub"], "436": ["n02814533", "beach_wagon"], "437": ["n02814860", "beacon"], "438": ["n02815834", "beaker"], "439": ["n02817516", "bearskin"], "440": ["n02823428", "beer_bottle"], "441": ["n02823750", "beer_glass"], "442": ["n02825657", "bell_cote"], "443": ["n02834397", "bib"], "444": ["n02835271", "bicycle-built-for-two"], "445": ["n02837789", "bikini"], "446": ["n02840245", "binder"], "447": ["n02841315", "binoculars"], "448": ["n02843684", "birdhouse"], "449": ["n02859443", "boathouse"], "450": ["n02860847", "bobsled"], "451": ["n02865351", "bolo_tie"], "452": ["n02869837", "bonnet"], "453": ["n02870880", "bookcase"], "454": ["n02871525", "bookshop"], "455": ["n02877765", "bottlecap"], "456": ["n02879718", "bow"], "457": ["n02883205", "bow_tie"], "458": ["n02892201", "brass"], "459": ["n02892767", "brassiere"], "460": ["n02894605", "breakwater"], "461": ["n02895154", "breastplate"], "462": ["n02906734", "broom"], "463": ["n02909870", "bucket"], "464": ["n02910353", "buckle"], "465": ["n02916936", "bulletproof_vest"], "466": ["n02917067", "bullet_train"], "467": ["n02927161", "butcher_shop"], "468": ["n02930766", "cab"], "469": ["n02939185", "caldron"], "470": ["n02948072", "candle"], "471": ["n02950826", "cannon"], "472": ["n02951358", "canoe"], "473": ["n02951585", "can_opener"], "474": ["n02963159", "cardigan"], "475": ["n02965783", "car_mirror"], "476": ["n02966193", "carousel"], "477": ["n02966687", "carpenter's_kit"], "478": ["n02971356", "carton"], "479": ["n02974003", "car_wheel"], "480": ["n02977058", "cash_machine"], "481": ["n02978881", "cassette"], "482": ["n02979186", "cassette_player"], "483": ["n02980441", "castle"], "484": ["n02981792", "catamaran"], 
"485": ["n02988304", "CD_player"], "486": ["n02992211", "cello"], "487": ["n02992529", "cellular_telephone"], "488": ["n02999410", "chain"], "489": ["n03000134", "chainlink_fence"], "490": ["n03000247", "chain_mail"], "491": ["n03000684", "chain_saw"], "492": ["n03014705", "chest"], "493": ["n03016953", "chiffonier"], "494": ["n03017168", "chime"], "495": ["n03018349", "china_cabinet"], "496": ["n03026506", "Christmas_stocking"], "497": ["n03028079", "church"], "498": ["n03032252", "cinema"], "499": ["n03041632", "cleaver"], "500": ["n03042490", "cliff_dwelling"], "501": ["n03045698", "cloak"], "502": ["n03047690", "clog"], "503": ["n03062245", "cocktail_shaker"], "504": ["n03063599", "coffee_mug"], "505": ["n03063689", "coffeepot"], "506": ["n03065424", "coil"], "507": ["n03075370", "combination_lock"], "508": ["n03085013", "computer_keyboard"], "509": ["n03089624", "confectionery"], "510": ["n03095699", "container_ship"], "511": ["n03100240", "convertible"], "512": ["n03109150", "corkscrew"], "513": ["n03110669", "cornet"], "514": ["n03124043", "cowboy_boot"], "515": ["n03124170", "cowboy_hat"], "516": ["n03125729", "cradle"], "517": ["n03126707", "crane"], "518": ["n03127747", "crash_helmet"], "519": ["n03127925", "crate"], "520": ["n03131574", "crib"], "521": ["n03133878", "Crock_Pot"], "522": ["n03134739", "croquet_ball"], "523": ["n03141823", "crutch"], "524": ["n03146219", "cuirass"], "525": ["n03160309", "dam"], "526": ["n03179701", "desk"], "527": ["n03180011", "desktop_computer"], "528": ["n03187595", "dial_telephone"], "529": ["n03188531", "diaper"], "530": ["n03196217", "digital_clock"], "531": ["n03197337", "digital_watch"], "532": ["n03201208", "dining_table"], "533": ["n03207743", "dishrag"], "534": ["n03207941", "dishwasher"], "535": ["n03208938", "disk_brake"], "536": ["n03216828", "dock"], "537": ["n03218198", "dogsled"], "538": ["n03220513", "dome"], "539": ["n03223299", "doormat"], "540": ["n03240683", "drilling_platform"], "541": ["n03249569", "drum"], "542": ["n03250847", "drumstick"], "543": ["n03255030", "dumbbell"], "544": ["n03259280", "Dutch_oven"], "545": ["n03271574", "electric_fan"], "546": ["n03272010", "electric_guitar"], "547": ["n03272562", "electric_locomotive"], "548": ["n03290653", "entertainment_center"], "549": ["n03291819", "envelope"], "550": ["n03297495", "espresso_maker"], "551": ["n03314780", "face_powder"], "552": ["n03325584", "feather_boa"], "553": ["n03337140", "file"], "554": ["n03344393", "fireboat"], "555": ["n03345487", "fire_engine"], "556": ["n03347037", "fire_screen"], "557": ["n03355925", "flagpole"], "558": ["n03372029", "flute"], "559": ["n03376595", "folding_chair"], "560": ["n03379051", "football_helmet"], "561": ["n03384352", "forklift"], "562": ["n03388043", "fountain"], "563": ["n03388183", "fountain_pen"], "564": ["n03388549", "four-poster"], "565": ["n03393912", "freight_car"], "566": ["n03394916", "French_horn"], "567": ["n03400231", "frying_pan"], "568": ["n03404251", "fur_coat"], "569": ["n03417042", "garbage_truck"], "570": ["n03424325", "gasmask"], "571": ["n03425413", "gas_pump"], "572": ["n03443371", "goblet"], "573": ["n03444034", "go-kart"], "574": ["n03445777", "golf_ball"], "575": ["n03445924", "golfcart"], "576": ["n03447447", "gondola"], "577": ["n03447721", "gong"], "578": ["n03450230", "gown"], "579": ["n03452741", "grand_piano"], "580": ["n03457902", "greenhouse"], "581": ["n03459775", "grille"], "582": ["n03461385", "grocery_store"], "583": ["n03467068", "guillotine"], "584": ["n03476684", "hair_slide"], 
"585": ["n03476991", "hair_spray"], "586": ["n03478589", "half_track"], "587": ["n03481172", "hammer"], "588": ["n03482405", "hamper"], "589": ["n03483316", "hand_blower"], "590": ["n03485407", "hand-held_computer"], "591": ["n03485794", "handkerchief"], "592": ["n03492542", "hard_disc"], "593": ["n03494278", "harmonica"], "594": ["n03495258", "harp"], "595": ["n03496892", "harvester"], "596": ["n03498962", "hatchet"], "597": ["n03527444", "holster"], "598": ["n03529860", "home_theater"], "599": ["n03530642", "honeycomb"], "600": ["n03532672", "hook"], "601": ["n03534580", "hoopskirt"], "602": ["n03535780", "horizontal_bar"], "603": ["n03538406", "horse_cart"], "604": ["n03544143", "hourglass"], "605": ["n03584254", "iPod"], "606": ["n03584829", "iron"], "607": ["n03590841", "jack-o'-lantern"], "608": ["n03594734", "jean"], "609": ["n03594945", "jeep"], "610": ["n03595614", "jersey"], "611": ["n03598930", "jigsaw_puzzle"], "612": ["n03599486", "jinrikisha"], "613": ["n03602883", "joystick"], "614": ["n03617480", "kimono"], "615": ["n03623198", "knee_pad"], "616": ["n03627232", "knot"], "617": ["n03630383", "lab_coat"], "618": ["n03633091", "ladle"], "619": ["n03637318", "lampshade"], "620": ["n03642806", "laptop"], "621": ["n03649909", "lawn_mower"], "622": ["n03657121", "lens_cap"], "623": ["n03658185", "letter_opener"], "624": ["n03661043", "library"], "625": ["n03662601", "lifeboat"], "626": ["n03666591", "lighter"], "627": ["n03670208", "limousine"], "628": ["n03673027", "liner"], "629": ["n03676483", "lipstick"], "630": ["n03680355", "Loafer"], "631": ["n03690938", "lotion"], "632": ["n03691459", "loudspeaker"], "633": ["n03692522", "loupe"], "634": ["n03697007", "lumbermill"], "635": ["n03706229", "magnetic_compass"], "636": ["n03709823", "mailbag"], "637": ["n03710193", "mailbox"], "638": ["n03710637", "maillot"], "639": ["n03710721", "maillot"], "640": ["n03717622", "manhole_cover"], "641": ["n03720891", "maraca"], "642": ["n03721384", "marimba"], "643": ["n03724870", "mask"], "644": ["n03729826", "matchstick"], "645": ["n03733131", "maypole"], "646": ["n03733281", "maze"], "647": ["n03733805", "measuring_cup"], "648": ["n03742115", "medicine_chest"], "649": ["n03743016", "megalith"], "650": ["n03759954", "microphone"], "651": ["n03761084", "microwave"], "652": ["n03763968", "military_uniform"], "653": ["n03764736", "milk_can"], "654": ["n03769881", "minibus"], "655": ["n03770439", "miniskirt"], "656": ["n03770679", "minivan"], "657": ["n03773504", "missile"], "658": ["n03775071", "mitten"], "659": ["n03775546", "mixing_bowl"], "660": ["n03776460", "mobile_home"], "661": ["n03777568", "Model_T"], "662": ["n03777754", "modem"], "663": ["n03781244", "monastery"], "664": ["n03782006", "monitor"], "665": ["n03785016", "moped"], "666": ["n03786901", "mortar"], "667": ["n03787032", "mortarboard"], "668": ["n03788195", "mosque"], "669": ["n03788365", "mosquito_net"], "670": ["n03791053", "motor_scooter"], "671": ["n03792782", "mountain_bike"], "672": ["n03792972", "mountain_tent"], "673": ["n03793489", "mouse"], "674": ["n03794056", "mousetrap"], "675": ["n03796401", "moving_van"], "676": ["n03803284", "muzzle"], "677": ["n03804744", "nail"], "678": ["n03814639", "neck_brace"], "679": ["n03814906", "necklace"], "680": ["n03825788", "nipple"], "681": ["n03832673", "notebook"], "682": ["n03837869", "obelisk"], "683": ["n03838899", "oboe"], "684": ["n03840681", "ocarina"], "685": ["n03841143", "odometer"], "686": ["n03843555", "oil_filter"], "687": ["n03854065", "organ"], "688": 
["n03857828", "oscilloscope"], "689": ["n03866082", "overskirt"], "690": ["n03868242", "oxcart"], "691": ["n03868863", "oxygen_mask"], "692": ["n03871628", "packet"], "693": ["n03873416", "paddle"], "694": ["n03874293", "paddlewheel"], "695": ["n03874599", "padlock"], "696": ["n03876231", "paintbrush"], "697": ["n03877472", "pajama"], "698": ["n03877845", "palace"], "699": ["n03884397", "panpipe"], "700": ["n03887697", "paper_towel"], "701": ["n03888257", "parachute"], "702": ["n03888605", "parallel_bars"], "703": ["n03891251", "park_bench"], "704": ["n03891332", "parking_meter"], "705": ["n03895866", "passenger_car"], "706": ["n03899768", "patio"], "707": ["n03902125", "pay-phone"], "708": ["n03903868", "pedestal"], "709": ["n03908618", "pencil_box"], "710": ["n03908714", "pencil_sharpener"], "711": ["n03916031", "perfume"], "712": ["n03920288", "Petri_dish"], "713": ["n03924679", "photocopier"], "714": ["n03929660", "pick"], "715": ["n03929855", "pickelhaube"], "716": ["n03930313", "picket_fence"], "717": ["n03930630", "pickup"], "718": ["n03933933", "pier"], "719": ["n03935335", "piggy_bank"], "720": ["n03937543", "pill_bottle"], "721": ["n03938244", "pillow"], "722": ["n03942813", "ping-pong_ball"], "723": ["n03944341", "pinwheel"], "724": ["n03947888", "pirate"], "725": ["n03950228", "pitcher"], "726": ["n03954731", "plane"], "727": ["n03956157", "planetarium"], "728": ["n03958227", "plastic_bag"], "729": ["n03961711", "plate_rack"], "730": ["n03967562", "plow"], "731": ["n03970156", "plunger"], "732": ["n03976467", "Polaroid_camera"], "733": ["n03976657", "pole"], "734": ["n03977966", "police_van"], "735": ["n03980874", "poncho"], "736": ["n03982430", "pool_table"], "737": ["n03983396", "pop_bottle"], "738": ["n03991062", "pot"], "739": ["n03992509", "potter's_wheel"], "740": ["n03995372", "power_drill"], "741": ["n03998194", "prayer_rug"], "742": ["n04004767", "printer"], "743": ["n04005630", "prison"], "744": ["n04008634", "projectile"], "745": ["n04009552", "projector"], "746": ["n04019541", "puck"], "747": ["n04023962", "punching_bag"], "748": ["n04026417", "purse"], "749": ["n04033901", "quill"], "750": ["n04033995", "quilt"], "751": ["n04037443", "racer"], "752": ["n04039381", "racket"], "753": ["n04040759", "radiator"], "754": ["n04041544", "radio"], "755": ["n04044716", "radio_telescope"], "756": ["n04049303", "rain_barrel"], "757": ["n04065272", "recreational_vehicle"], "758": ["n04067472", "reel"], "759": ["n04069434", "reflex_camera"], "760": ["n04070727", "refrigerator"], "761": ["n04074963", "remote_control"], "762": ["n04081281", "restaurant"], "763": ["n04086273", "revolver"], "764": ["n04090263", "rifle"], "765": ["n04099969", "rocking_chair"], "766": ["n04111531", "rotisserie"], "767": ["n04116512", "rubber_eraser"], "768": ["n04118538", "rugby_ball"], "769": ["n04118776", "rule"], "770": ["n04120489", "running_shoe"], "771": ["n04125021", "safe"], "772": ["n04127249", "safety_pin"], "773": ["n04131690", "saltshaker"], "774": ["n04133789", "sandal"], "775": ["n04136333", "sarong"], "776": ["n04141076", "sax"], "777": ["n04141327", "scabbard"], "778": ["n04141975", "scale"], "779": ["n04146614", "school_bus"], "780": ["n04147183", "schooner"], "781": ["n04149813", "scoreboard"], "782": ["n04152593", "screen"], "783": ["n04153751", "screw"], "784": ["n04154565", "screwdriver"], "785": ["n04162706", "seat_belt"], "786": ["n04179913", "sewing_machine"], "787": ["n04192698", "shield"], "788": ["n04200800", "shoe_shop"], "789": ["n04201297", "shoji"], "790": ["n04204238", 
"shopping_basket"], "791": ["n04204347", "shopping_cart"], "792": ["n04208210", "shovel"], "793": ["n04209133", "shower_cap"], "794": ["n04209239", "shower_curtain"], "795": ["n04228054", "ski"], "796": ["n04229816", "ski_mask"], "797": ["n04235860", "sleeping_bag"], "798": ["n04238763", "slide_rule"], "799": ["n04239074", "sliding_door"], "800": ["n04243546", "slot"], "801": ["n04251144", "snorkel"], "802": ["n04252077", "snowmobile"], "803": ["n04252225", "snowplow"], "804": ["n04254120", "soap_dispenser"], "805": ["n04254680", "soccer_ball"], "806": ["n04254777", "sock"], "807": ["n04258138", "solar_dish"], "808": ["n04259630", "sombrero"], "809": ["n04263257", "soup_bowl"], "810": ["n04264628", "space_bar"], "811": ["n04265275", "space_heater"], "812": ["n04266014", "space_shuttle"], "813": ["n04270147", "spatula"], "814": ["n04273569", "speedboat"], "815": ["n04275548", "spider_web"], "816": ["n04277352", "spindle"], "817": ["n04285008", "sports_car"], "818": ["n04286575", "spotlight"], "819": ["n04296562", "stage"], "820": ["n04310018", "steam_locomotive"], "821": ["n04311004", "steel_arch_bridge"], "822": ["n04311174", "steel_drum"], "823": ["n04317175", "stethoscope"], "824": ["n04325704", "stole"], "825": ["n04326547", "stone_wall"], "826": ["n04328186", "stopwatch"], "827": ["n04330267", "stove"], "828": ["n04332243", "strainer"], "829": ["n04335435", "streetcar"], "830": ["n04336792", "stretcher"], "831": ["n04344873", "studio_couch"], "832": ["n04346328", "stupa"], "833": ["n04347754", "submarine"], "834": ["n04350905", "suit"], "835": ["n04355338", "sundial"], "836": ["n04355933", "sunglass"], "837": ["n04356056", "sunglasses"], "838": ["n04357314", "sunscreen"], "839": ["n04366367", "suspension_bridge"], "840": ["n04367480", "swab"], "841": ["n04370456", "sweatshirt"], "842": ["n04371430", "swimming_trunks"], "843": ["n04371774", "swing"], "844": ["n04372370", "switch"], "845": ["n04376876", "syringe"], "846": ["n04380533", "table_lamp"], "847": ["n04389033", "tank"], "848": ["n04392985", "tape_player"], "849": ["n04398044", "teapot"], "850": ["n04399382", "teddy"], "851": ["n04404412", "television"], "852": ["n04409515", "tennis_ball"], "853": ["n04417672", "thatch"], "854": ["n04418357", "theater_curtain"], "855": ["n04423845", "thimble"], "856": ["n04428191", "thresher"], "857": ["n04429376", "throne"], "858": ["n04435653", "tile_roof"], "859": ["n04442312", "toaster"], "860": ["n04443257", "tobacco_shop"], "861": ["n04447861", "toilet_seat"], "862": ["n04456115", "torch"], "863": ["n04458633", "totem_pole"], "864": ["n04461696", "tow_truck"], "865": ["n04462240", "toyshop"], "866": ["n04465501", "tractor"], "867": ["n04467665", "trailer_truck"], "868": ["n04476259", "tray"], "869": ["n04479046", "trench_coat"], "870": ["n04482393", "tricycle"], "871": ["n04483307", "trimaran"], "872": ["n04485082", "tripod"], "873": ["n04486054", "triumphal_arch"], "874": ["n04487081", "trolleybus"], "875": ["n04487394", "trombone"], "876": ["n04493381", "tub"], "877": ["n04501370", "turnstile"], "878": ["n04505470", "typewriter_keyboard"], "879": ["n04507155", "umbrella"], "880": ["n04509417", "unicycle"], "881": ["n04515003", "upright"], "882": ["n04517823", "vacuum"], "883": ["n04522168", "vase"], "884": ["n04523525", "vault"], "885": ["n04525038", "velvet"], "886": ["n04525305", "vending_machine"], "887": ["n04532106", "vestment"], "888": ["n04532670", "viaduct"], "889": ["n04536866", "violin"], "890": ["n04540053", "volleyball"], "891": ["n04542943", "waffle_iron"], "892": 
["n04548280", "wall_clock"], "893": ["n04548362", "wallet"], "894": ["n04550184", "wardrobe"], "895": ["n04552348", "warplane"], "896": ["n04553703", "washbasin"], "897": ["n04554684", "washer"], "898": ["n04557648", "water_bottle"], "899": ["n04560804", "water_jug"], "900": ["n04562935", "water_tower"], "901": ["n04579145", "whiskey_jug"], "902": ["n04579432", "whistle"], "903": ["n04584207", "wig"], "904": ["n04589890", "window_screen"], "905": ["n04590129", "window_shade"], "906": ["n04591157", "Windsor_tie"], "907": ["n04591713", "wine_bottle"], "908": ["n04592741", "wing"], "909": ["n04596742", "wok"], "910": ["n04597913", "wooden_spoon"], "911": ["n04599235", "wool"], "912": ["n04604644", "worm_fence"], "913": ["n04606251", "wreck"], "914": ["n04612504", "yawl"], "915": ["n04613696", "yurt"], "916": ["n06359193", "web_site"], "917": ["n06596364", "comic_book"], "918": ["n06785654", "crossword_puzzle"], "919": ["n06794110", "street_sign"], "920": ["n06874185", "traffic_light"], "921": ["n07248320", "book_jacket"], "922": ["n07565083", "menu"], "923": ["n07579787", "plate"], "924": ["n07583066", "guacamole"], "925": ["n07584110", "consomme"], "926": ["n07590611", "hot_pot"], "927": ["n07613480", "trifle"], "928": ["n07614500", "ice_cream"], "929": ["n07615774", "ice_lolly"], "930": ["n07684084", "French_loaf"], "931": ["n07693725", "bagel"], "932": ["n07695742", "pretzel"], "933": ["n07697313", "cheeseburger"], "934": ["n07697537", "hotdog"], "935": ["n07711569", "mashed_potato"], "936": ["n07714571", "head_cabbage"], "937": ["n07714990", "broccoli"], "938": ["n07715103", "cauliflower"], "939": ["n07716358", "zucchini"], "940": ["n07716906", "spaghetti_squash"], "941": ["n07717410", "acorn_squash"], "942": ["n07717556", "butternut_squash"], "943": ["n07718472", "cucumber"], "944": ["n07718747", "artichoke"], "945": ["n07720875", "bell_pepper"], "946": ["n07730033", "cardoon"], "947": ["n07734744", "mushroom"], "948": ["n07742313", "Granny_Smith"], "949": ["n07745940", "strawberry"], "950": ["n07747607", "orange"], "951": ["n07749582", "lemon"], "952": ["n07753113", "fig"], "953": ["n07753275", "pineapple"], "954": ["n07753592", "banana"], "955": ["n07754684", "jackfruit"], "956": ["n07760859", "custard_apple"], "957": ["n07768694", "pomegranate"], "958": ["n07802026", "hay"], "959": ["n07831146", "carbonara"], "960": ["n07836838", "chocolate_sauce"], "961": ["n07860988", "dough"], "962": ["n07871810", "meat_loaf"], "963": ["n07873807", "pizza"], "964": ["n07875152", "potpie"], "965": ["n07880968", "burrito"], "966": ["n07892512", "red_wine"], "967": ["n07920052", "espresso"], "968": ["n07930864", "cup"], "969": ["n07932039", "eggnog"], "970": ["n09193705", "alp"], "971": ["n09229709", "bubble"], "972": ["n09246464", "cliff"], "973": ["n09256479", "coral_reef"], "974": ["n09288635", "geyser"], "975": ["n09332890", "lakeside"], "976": ["n09399592", "promontory"], "977": ["n09421951", "sandbar"], "978": ["n09428293", "seashore"], "979": ["n09468604", "valley"], "980": ["n09472597", "volcano"], "981": ["n09835506", "ballplayer"], "982": ["n10148035", "groom"], "983": ["n10565667", "scuba_diver"], "984": ["n11879895", "rapeseed"], "985": ["n11939491", "daisy"], "986": ["n12057211", "yellow_lady's_slipper"], "987": ["n12144580", "corn"], "988": ["n12267677", "acorn"], "989": ["n12620546", "hip"], "990": ["n12768682", "buckeye"], "991": ["n12985857", "coral_fungus"], "992": ["n12998815", "agaric"], "993": ["n13037406", "gyromitra"], "994": ["n13040303", "stinkhorn"], "995": ["n13044778", 
"earthstar"], "996": ["n13052670", "hen-of-the-woods"], "997": ["n13054560", "bolete"], "998": ["n13133613", "ear"], "999": ["n15075141", "toilet_tissue"]} \ No newline at end of file diff --git a/examples/Huggingface_Largemodels/Download_model.py b/examples/large_models/Huggingface_accelerate/Download_model.py similarity index 100% rename from examples/Huggingface_Largemodels/Download_model.py rename to examples/large_models/Huggingface_accelerate/Download_model.py diff --git a/examples/Huggingface_Largemodels/Readme.md b/examples/large_models/Huggingface_accelerate/Readme.md similarity index 100% rename from examples/Huggingface_Largemodels/Readme.md rename to examples/large_models/Huggingface_accelerate/Readme.md diff --git a/examples/Huggingface_Largemodels/config.properties b/examples/large_models/Huggingface_accelerate/config.properties similarity index 100% rename from examples/Huggingface_Largemodels/config.properties rename to examples/large_models/Huggingface_accelerate/config.properties diff --git a/examples/Huggingface_Largemodels/custom_handler.py b/examples/large_models/Huggingface_accelerate/custom_handler.py similarity index 100% rename from examples/Huggingface_Largemodels/custom_handler.py rename to examples/large_models/Huggingface_accelerate/custom_handler.py diff --git a/examples/Huggingface_Largemodels/requirements.txt b/examples/large_models/Huggingface_accelerate/requirements.txt similarity index 100% rename from examples/Huggingface_Largemodels/requirements.txt rename to examples/large_models/Huggingface_accelerate/requirements.txt diff --git a/examples/Huggingface_Largemodels/sample_text.txt b/examples/large_models/Huggingface_accelerate/sample_text.txt similarity index 100% rename from examples/Huggingface_Largemodels/sample_text.txt rename to examples/large_models/Huggingface_accelerate/sample_text.txt diff --git a/examples/Huggingface_Largemodels/setup_config.json b/examples/large_models/Huggingface_accelerate/setup_config.json similarity index 100% rename from examples/Huggingface_Largemodels/setup_config.json rename to examples/large_models/Huggingface_accelerate/setup_config.json diff --git a/examples/large_models/Huggingface_pippy/Readme.md b/examples/large_models/Huggingface_pippy/Readme.md new file mode 100644 index 0000000000..970d0315af --- /dev/null +++ b/examples/large_models/Huggingface_pippy/Readme.md @@ -0,0 +1,89 @@ +# Loading large Huggingface models with PiPPy (PyTorch Native Large inference solution) + +This document briefs on serving large HF model with PiPPy. + +PiPPy provides pipeline parallelism for serving large models that would not fit into one gpu. It takes your model and splits it into equal sizes (stages) partitioned over the number devices you specify. Then uses micro batching to run your batched input for inference ( its is more optimal for batch sizes >1). Micro-batching is the techniques in pipeline parallelism to maximize gpu utilization. + +## How to serve your large HuggingFace models with PiPPy in Torchserve? + +We use a Torchserve custom handler that inherits from base_pippy_handler to load the model and define our logic for preprocess, inference and post processing. This is basically very similar to your evaluation process. Following settings has been tested on g5.12xlarge EC2 instance which has 4xA10 GPUs. + +To run this example we need to have torchpippy installed. This has been added to the requirement.txt which can be bundled during model packaging. 
+ +To install torchpippy, run the following: + +```bash +pip install torchpippy + +``` + +### Step 1: Download model + +```bash +python ../utils/Download_model.py --model_name facebook/opt-30b +``` +The script prints the path where the model is downloaded, as shown below. This is an example; in your workload you will want to use your actual trained model checkpoints. + +`model/models--facebook--opt-30b/snapshots/ceea0a90ac0f6fae7c2c34bcb40477438c152546/` + +The downloaded model is around 14GB. + + +### Step 2: Create a model-config.yaml that includes the following + +```yaml + +minWorkers: 1 +maxWorkers: 1 +maxBatchDelay: 100 +responseTimeout: 120 +parallelLevel: 4 +deviceType: "gpu" +parallelType: "pp" # PiPPy as the solution for distributed inference +torchrun: + nproc-per-node: 4 # number of processes torchrun starts to serve your model; set to world_size, i.e. the number of + # GPUs you wish to split your model over +pippy: + chunks: 1 # sets the micro-batch size: micro-batch = batch size / chunks + input_names: ['input_ids'] # input arg names to the model; required for FX tracing + model_type: "HF" # set the model type to HF if you are using a HuggingFace model, otherwise leave it blank + rpc_timeout: 1800 + num_worker_threads: 512 # number of threads for the RPC worker; 512 is usually a good number + +handler: + max_length: 80 # max length of tokens for the tokenizer in the handler + model_name: "/home/ubuntu/serve/examples/large_models/Huggingface_pippy/model/models--facebook--opt-30b/snapshots/ceea0a90ac0f6fae7c2c34bcb40477438c152546" # path to the checkpoints, here the downloaded files; change this to your model path + index_file_name: 'pytorch_model.bin.index.json' # index JSON file in the model checkpoint folder that keeps information about the distributed checkpoints + manual_seed: 40 + dtype: fp16 # data type used to load your model checkpoint; supported: fp32, fp16, bf16 +``` + +### Step 3: Generate a Tar/MAR file + +Navigate up to the `large_models` directory. Because bundling the large model checkpoints is very time consuming, we pass the model checkpoint path in the model-config.yaml as shown above. This makes packaging very fast; for production settings, the large models can be put in a shared location and referenced from there in the model config.
+ +```bash +torch-model-archiver --model-name opt --version 1.0 --handler pippy_handler.py -r requirements.txt --config-file model-config.yaml --archive-format tgz + +``` + +### Step 4: Add the tgz file to the model store + +```bash +mkdir model_store +mv opt.tar.gz model_store +``` + +### Step 5: Start TorchServe + +Update config.properties if needed, then start TorchServe: + +```bash +torchserve --ncs --start --model-store model_store --models opt.tar.gz +``` + +### Step 6: Run inference + +```bash +curl -v "http://localhost:8080/predictions/opt" -T sample_text.txt +``` diff --git a/examples/large_models/Huggingface_pippy/model-config.yaml b/examples/large_models/Huggingface_pippy/model-config.yaml new file mode 100644 index 0000000000..f3ef7f0299 --- /dev/null +++ b/examples/large_models/Huggingface_pippy/model-config.yaml @@ -0,0 +1,25 @@ +#frontend settings +minWorkers: 1 +maxWorkers: 1 +maxBatchDelay: 200 +responseTimeout: 300 +parallelType: "pp" +deviceType: "gpu" +torchrun: + nproc-per-node: 4 + +#backend settings +pippy: + rpc_timeout: 1800 + model_type: "HF" + chunks: 1 + input_names: ["input_ids"] + num_worker_threads: 128 + +handler: + model_path: "/home/ubuntu/serve/examples/large_models/Huggingface_pippy/model/models--facebook--opt-30b/snapshots/ceea0a90ac0f6fae7c2c34bcb40477438c152546" + index_filename: 'pytorch_model.bin.index.json' + max_length: 50 + max_new_tokens: 60 + manual_seed: 40 + dtype: fp16 diff --git a/examples/large_models/Huggingface_pippy/pippy_handler.py b/examples/large_models/Huggingface_pippy/pippy_handler.py new file mode 100644 index 0000000000..6db8d32c41 --- /dev/null +++ b/examples/large_models/Huggingface_pippy/pippy_handler.py @@ -0,0 +1,157 @@ +import logging +import time +from abc import ABC + +import packaging.version +import torch +import transformers +from transformers import AutoModelForCausalLM, AutoTokenizer + +from ts.handler_utils.distributed.pt_pippy import get_pipeline_driver +from ts.torch_handler.distributed.base_pippy_handler import BasePippyHandler + +logger = logging.getLogger(__name__) +logger.info("Transformers version %s", transformers.__version__) +if packaging.version.parse(torch.__version__) >= packaging.version.parse("2.0.0"): + logger.info("PyTorch version is 2.0.0 or greater") +else: + logger.info( + "PyTorch version is less than 2.0.0, initializing with meta device needs PyTorch 2.0.0 and greater" + ) + + +class TransformersSeqClassifierHandler(BasePippyHandler, ABC): + """ + Transformers handler class for text generation with large models. + """ + + def __init__(self): + super(TransformersSeqClassifierHandler, self).__init__() + self.initialized = False + + def initialize(self, ctx): + """In this initialize function, the HF large model is loaded and + partitioned into multiple stages, each on one device, using PiPPy. + Args: + ctx (context): It is a JSON Object containing information + pertaining to the model artifacts and parameters.
+ """ + super().initialize(ctx) + self.manifest = ctx.manifest + properties = ctx.system_properties + model_dir = properties.get("model_dir") + self.device = self.local_rank + + model_path = ctx.model_yaml_config["handler"]["model_path"] + seed = ctx.model_yaml_config["handler"]["manual_seed"] + dtype_str = ctx.model_yaml_config["handler"]["dtype"] + torch.manual_seed(seed) + + dtypes = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16} + + dtype = dtypes.get(dtype_str, torch.float32) + if dtype != torch.float32 and dtype_str not in dtypes: + logger.info( + f"Unsupported data type {dtype_str}, " + "please submit a PR to support it. Falling back to fp32 now." + ) + + skip_init_start = time.perf_counter() + with torch.device("meta"): + self.model = AutoModelForCausalLM.from_pretrained( + model_path, use_cache=False, torch_dtype=dtype + ) + skip_init_end = time.perf_counter() + logger.info( + f" init model time on meta device took {skip_init_end - skip_init_start} seconds" + ) + self.tokenizer = AutoTokenizer.from_pretrained(model_path, return_tensors="pt") + self.tokenizer.pad_token = self.tokenizer.eos_token + + self.max_length = ctx.model_yaml_config["handler"]["max_length"] + self.max_new_tokens = ctx.model_yaml_config["handler"]["max_new_tokens"] + + logger.info("Instantiating model Pipeline") + pippy_compile_time_start = time.perf_counter() + self.model = get_pipeline_driver(self.model, self.world_size, ctx) + pippy_compile_time_end = time.perf_counter() + + logger.info( + f" pippy compile time took {pippy_compile_time_end- pippy_compile_time_start} seconds on rank {self.local_rank}" + ) + + logger.info("Transformer model from path %s loaded successfully", model_dir) + + self.initialized = True + + def preprocess(self, requests): + """ + Basic text preprocessing, based on the user's choice of application mode. + Args: + requests (list): A list of dictionaries with a "data" or "body" field, each + containing the input text to be processed. + Returns: + tuple: A tuple with two tensors: the batch of input ids and the batch of + attention masks. + """ + input_texts = [data.get("data") or data.get("body") for data in requests] + input_ids_batch = [] + for input_text in input_texts: + input_ids = self.encode_input_text(input_text) + input_ids_batch.append(input_ids) + input_ids_batch = torch.cat(input_ids_batch, dim=0).to(self.device) + return input_ids_batch + + def encode_input_text(self, input_text): + """ + Encodes a single input text using the tokenizer. + Args: + input_text (str): The input text to be encoded. + Returns: + tuple: A tuple with two tensors: the encoded input ids and the attention mask. + """ + if isinstance(input_text, (bytes, bytearray)): + input_text = input_text.decode("utf-8") + logger.info("Received text: '%s'", input_text) + inputs = self.tokenizer.encode_plus( + input_text, + max_length=self.max_length, + pad_to_max_length=True, + add_special_tokens=True, + return_tensors="pt", + ) + input_ids = inputs["input_ids"] + return input_ids + + def inference(self, input_batch): + """ + Predicts the class (or classes) of the received text using the serialized transformers + checkpoint. + Args: + input_batch (tuple): A tuple with two tensors: the batch of input ids and the batch + of attention masks, as returned by the preprocess function. + Returns: + list: A list of strings with the predicted values for each input text in the batch. 
+ """ + input_ids_batch = input_batch + input_ids_batch = input_ids_batch.to(self.device) + outputs = self.model.generate( + input_ids_batch, + max_length=self.max_new_tokens, + ) + generated_text = self.tokenizer.batch_decode( + outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + + logger.info("Generated text: %s", generated_text) + return generated_text + + def postprocess(self, inference_output): + """Post Process Function converts the predicted response into Torchserve readable format. + Args: + inference_output (list): It contains the predicted response of the input text. + Returns: + (list): Returns a list of the Predictions and Explanations. + """ + return inference_output diff --git a/examples/large_models/Huggingface_pippy/requirements.txt b/examples/large_models/Huggingface_pippy/requirements.txt new file mode 100644 index 0000000000..32d9296363 --- /dev/null +++ b/examples/large_models/Huggingface_pippy/requirements.txt @@ -0,0 +1,2 @@ +transformers +torchpippy diff --git a/examples/large_models/Huggingface_pippy/sample_text.txt b/examples/large_models/Huggingface_pippy/sample_text.txt new file mode 100644 index 0000000000..d5c3fdae71 --- /dev/null +++ b/examples/large_models/Huggingface_pippy/sample_text.txt @@ -0,0 +1 @@ +Hey, are you conscious? Can you talk to me? diff --git a/examples/large_models/deepspeed/opt/Readme.md b/examples/large_models/deepspeed/opt/Readme.md new file mode 100644 index 0000000000..cb8c06a976 --- /dev/null +++ b/examples/large_models/deepspeed/opt/Readme.md @@ -0,0 +1,94 @@ +# Loading large Huggingface models on Multiple GPUs + +This document briefs on serving large HG models on multiple GPUs using deepspeed. To speed up TorchServe regression test, facebook/opt-350m is used in this example. User can choose larger model such as facebook/opt-6.7b. + +## Option 1: Using model_dir + +### Step 1: Download model + +```bash +python ../../utils/Download_model.py --model_path model --model_name facebook/opt-350m --revision main +``` + +The script prints the path where the model is downloaded as below. + +`model/models--facebook--opt-350m/snapshots/cb32f77e905cccbca1d970436fb0f5e6b58ee3c5/` + +### Step 2: Generate mar or tgz file + +```bash +torch-model-archiver --model-name opt --version 1.0 --handler custom_handler.py --extra-files model/models--facebook--opt-350m/snapshots/cb32f77e905cccbca1d970436fb0f5e6b58ee3c5/,ds-config.json -r requirements.txt --config-file model-config.yaml --archive-format tgz +``` + +### Step 3: Add the tgz file to model store + +```bash +mkdir model_store +mv opt.tar.gz model_store +``` + +### Step 4: Start torchserve + +```bash +torchserve --start --ncs --model-store model_store --models opt.tar.gz +``` + +### Step 5: Run inference + +```bash +curl -v "http://localhost:8080/predictions/opt" -T sample_text.txt +``` + +## Option 2: Using model name + +### Step 1: Update initialize in custom_handler.py +```python + def initialize(self, ctx: Context): + """In this initialize function, the HF large model is loaded and + partitioned using DeepSpeed. + Args: + ctx (context): It is a JSON Object containing information + pertaining to the model artefacts parameters. 
+ """ + super().initialize(ctx) + model_dir = ctx.system_properties.get("model_dir") + self.max_length = int(ctx.model_yaml_config["handler"]["max_length"]) + self.max_new_tokens = int(ctx.model_yaml_config["handler"]["max_new_tokens"]) + seed = int(ctx.model_yaml_config["handler"]["manual_seed"]) + torch.manual_seed(seed) + + self.tokenizer = AutoTokenizer.from_pretrained(model_dir, padding_side="left") + self.tokenizer.pad_token = self.tokenizer.eos_token + self.model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.float16) + self.model.eval() + + ds_engine = get_ds_engine(self.model, ctx) + self.model = ds_engine.module + logger.info("Model %s loaded successfully", ctx.model_name) + self.initialized = True +``` + +### Step 2: Generate mar or tgz file + +```bash +torch-model-archiver --model-name opt --version 1.0 --handler custom_handler.py --extra-files ds-config.json -r requirements.txt --config-file model-config.yaml +``` + +### Step 3: Add the mar file to model store + +```bash +mkdir model_store +mv opt.mar model_store +``` + +### Step 4: Start torchserve + +```bash +torchserve --start --ncs --model-store model_store --models opt.mar +``` + +### Step 5: Run inference + +```bash +curl -v "http://localhost:8080/predictions/opt" -T sample_text.txt +``` diff --git a/examples/large_models/deepspeed/opt/custom_handler.py b/examples/large_models/deepspeed/opt/custom_handler.py new file mode 100644 index 0000000000..f129de0093 --- /dev/null +++ b/examples/large_models/deepspeed/opt/custom_handler.py @@ -0,0 +1,129 @@ +import logging +from abc import ABC + +import torch +import transformers +from transformers import AutoModelForCausalLM, AutoTokenizer + +from ts.context import Context +from ts.handler_utils.distributed.deepspeed import get_ds_engine +from ts.torch_handler.distributed.base_deepspeed_handler import BaseDeepSpeedHandler + +logger = logging.getLogger(__name__) +logger.info("Transformers version %s", transformers.__version__) + + +class TransformersSeqClassifierHandler(BaseDeepSpeedHandler, ABC): + """ + Transformers handler class for sequence, token classification and question answering. + """ + + def __init__(self): + super(TransformersSeqClassifierHandler, self).__init__() + self.max_length = None + self.max_new_tokens = None + self.tokenizer = None + self.initialized = False + + def initialize(self, ctx: Context): + """In this initialize function, the HF large model is loaded and + partitioned using DeepSpeed. + Args: + ctx (context): It is a JSON Object containing information + pertaining to the model artefacts parameters. + """ + super().initialize(ctx) + model_dir = ctx.system_properties.get("model_dir") + self.max_length = int(ctx.model_yaml_config["handler"]["max_length"]) + self.max_new_tokens = int(ctx.model_yaml_config["handler"]["max_new_tokens"]) + seed = int(ctx.model_yaml_config["handler"]["manual_seed"]) + torch.manual_seed(seed) + + self.tokenizer = AutoTokenizer.from_pretrained(model_dir, padding_side="left") + self.tokenizer.pad_token = self.tokenizer.eos_token + self.model = AutoModelForCausalLM.from_pretrained( + model_dir, torch_dtype=torch.float16 + ) + self.model.eval() + + ds_engine = get_ds_engine(self.model, ctx) + self.model = ds_engine.module + logger.info("Model %s loaded successfully", ctx.model_name) + self.initialized = True + + def preprocess(self, requests): + """ + Basic text preprocessing, based on the user's choice of application mode. 
+ Args: + requests (list): A list of dictionaries with a "data" or "body" field, each + containing the input text to be processed. + Returns: + tuple: A tuple with two tensors: the batch of input ids and the batch of + attention masks. + """ + input_texts = [data.get("data") or data.get("body") for data in requests] + input_ids_batch, attention_mask_batch = [], [] + for input_text in input_texts: + input_ids, attention_mask = self.encode_input_text(input_text) + input_ids_batch.append(input_ids) + attention_mask_batch.append(attention_mask) + input_ids_batch = torch.cat(input_ids_batch, dim=0).to(self.device) + attention_mask_batch = torch.cat(attention_mask_batch, dim=0).to(self.device) + return input_ids_batch, attention_mask_batch + + def encode_input_text(self, input_text): + """ + Encodes a single input text using the tokenizer. + Args: + input_text (str): The input text to be encoded. + Returns: + tuple: A tuple with two tensors: the encoded input ids and the attention mask. + """ + if isinstance(input_text, (bytes, bytearray)): + input_text = input_text.decode("utf-8") + logger.info("Received text: '%s'", input_text) + inputs = self.tokenizer.encode_plus( + input_text, + max_length=self.max_length, + padding=True, + add_special_tokens=True, + return_tensors="pt", + truncation=True, + ) + input_ids = inputs["input_ids"] + attention_mask = inputs["attention_mask"] + return input_ids, attention_mask + + def inference(self, input_batch): + """ + Generates text continuations for the received texts using the DeepSpeed-initialized model. + Args: + input_batch (tuple): A tuple with two tensors: the batch of input ids and the batch + of attention masks, as returned by the preprocess function. + Returns: + list: A list of strings with the predicted values for each input text in the batch. + """ + input_ids_batch, attention_mask_batch = input_batch + input_ids_batch = input_ids_batch.to(self.device) + outputs = self.model.generate( + input_ids_batch, + attention_mask=attention_mask_batch, + max_length=self.max_new_tokens, + ) + + inferences = self.tokenizer.batch_decode( + outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + + logger.info("Generated text: %s", inferences) + return inferences + + def postprocess(self, inference_output): + """Post-process function that converts the predicted response into a TorchServe-readable format. + Args: + inference_output (list): The predicted response of the input text. + Returns: + (list): Returns a list of the predictions.
+ """ + return inference_output diff --git a/examples/large_models/deepspeed/opt/ds-config.json b/examples/large_models/deepspeed/opt/ds-config.json new file mode 100644 index 0000000000..129ed04bbc --- /dev/null +++ b/examples/large_models/deepspeed/opt/ds-config.json @@ -0,0 +1,7 @@ +{ + "dtype": "torch.float16", + "replace_with_kernel_inject": true, + "tensor_parallel": { + "tp_size": 2 + } +} \ No newline at end of file diff --git a/examples/large_models/deepspeed/opt/model-config.yaml b/examples/large_models/deepspeed/opt/model-config.yaml new file mode 100644 index 0000000000..3f8fd61bdd --- /dev/null +++ b/examples/large_models/deepspeed/opt/model-config.yaml @@ -0,0 +1,21 @@ +# TorchServe frontend parameters +minWorkers: 1 +maxWorkers: 1 +maxBatchDelay: 100 +responseTimeout: 1200 +parallelType: "tp" +deviceType: "gpu" +# example of user specified GPU deviceIds +deviceIds: [2,3] # seting CUDA_VISIBLE_DEVICES + +torchrun: + nproc-per-node: 2 + +# TorchServe Backend parameters +deepspeed: + config: ds-config.json + +handler: + max_length: 50 + max_new_tokens: 10 + manual_seed: 40 diff --git a/examples/large_models/deepspeed/opt/requirements.txt b/examples/large_models/deepspeed/opt/requirements.txt new file mode 100644 index 0000000000..149ef947b4 --- /dev/null +++ b/examples/large_models/deepspeed/opt/requirements.txt @@ -0,0 +1,2 @@ +transformers==4.28.1 +deepspeed==0.9.1 diff --git a/examples/large_models/deepspeed/opt/sample_text.txt b/examples/large_models/deepspeed/opt/sample_text.txt new file mode 100644 index 0000000000..5a17312cbb --- /dev/null +++ b/examples/large_models/deepspeed/opt/sample_text.txt @@ -0,0 +1 @@ +Today the weather is really nice and I am planning on diff --git a/examples/deepspeed_mii/DeepSpeed_mii_handler.py b/examples/large_models/deepspeed_mii/DeepSpeed_mii_handler.py similarity index 100% rename from examples/deepspeed_mii/DeepSpeed_mii_handler.py rename to examples/large_models/deepspeed_mii/DeepSpeed_mii_handler.py diff --git a/examples/deepspeed_mii/Download_deepseed_mii_models.py b/examples/large_models/deepspeed_mii/Download_deepseed_mii_models.py similarity index 100% rename from examples/deepspeed_mii/Download_deepseed_mii_models.py rename to examples/large_models/deepspeed_mii/Download_deepseed_mii_models.py diff --git a/examples/deepspeed_mii/Readme.md b/examples/large_models/deepspeed_mii/Readme.md similarity index 100% rename from examples/deepspeed_mii/Readme.md rename to examples/large_models/deepspeed_mii/Readme.md diff --git a/examples/deepspeed_mii/config.properties b/examples/large_models/deepspeed_mii/config.properties similarity index 100% rename from examples/deepspeed_mii/config.properties rename to examples/large_models/deepspeed_mii/config.properties diff --git a/examples/deepspeed_mii/deepspeed_mii_stable_diffusion.py b/examples/large_models/deepspeed_mii/deepspeed_mii_stable_diffusion.py similarity index 100% rename from examples/deepspeed_mii/deepspeed_mii_stable_diffusion.py rename to examples/large_models/deepspeed_mii/deepspeed_mii_stable_diffusion.py diff --git a/examples/deepspeed_mii/query.py b/examples/large_models/deepspeed_mii/query.py similarity index 100% rename from examples/deepspeed_mii/query.py rename to examples/large_models/deepspeed_mii/query.py diff --git a/examples/deepspeed_mii/requirements.txt b/examples/large_models/deepspeed_mii/requirements.txt similarity index 100% rename from examples/deepspeed_mii/requirements.txt rename to examples/large_models/deepspeed_mii/requirements.txt diff --git 
a/examples/deepspeed_mii/setup_config.json b/examples/large_models/deepspeed_mii/setup_config.json similarity index 100% rename from examples/deepspeed_mii/setup_config.json rename to examples/large_models/deepspeed_mii/setup_config.json diff --git a/examples/large_models/utils/Download_model.py b/examples/large_models/utils/Download_model.py new file mode 100644 index 0000000000..2e8f6c9579 --- /dev/null +++ b/examples/large_models/utils/Download_model.py @@ -0,0 +1,54 @@ +import argparse +import os + +from huggingface_hub import HfApi, snapshot_download + + +def dir_path(path_str): + if os.path.isdir(path_str): + return path_str + elif input(f"{path_str} does not exist, create directory? [y/n]").lower() == "y": + os.makedirs(path_str) + return path_str + else: + raise NotADirectoryError(path_str) + + +class HFModelNotFoundError(Exception): + def __init__(self, model_str): + super().__init__(f"HuggingFace model not found: '{model_str}'") + + +def hf_model(model_str): + api = HfApi() + models = [m.modelId for m in api.list_models()] + if model_str in models: + return model_str + else: + raise HFModelNotFoundError(model_str) + + +parser = argparse.ArgumentParser() +parser.add_argument( + "--model_path", + "-o", + type=dir_path, + default="model", + help="Output directory for downloaded model files", +) +parser.add_argument( + "--model_name", "-m", type=hf_model, required=True, help="HuggingFace model name" +) +parser.add_argument("--revision", "-r", type=str, default="main", help="Revision") +args = parser.parse_args() +# Only download pytorch checkpoint files +allow_patterns = ["*.json", "*.pt", "*.bin", "*.txt", "*.model"] + +snapshot_path = snapshot_download( + repo_id=args.model_name, + revision=args.revision, + allow_patterns=allow_patterns, + cache_dir=args.model_path, + use_auth_token=False, +) +print(f"Files for '{args.model_name}' are downloaded to '{snapshot_path}'") diff --git a/examples/micro_batching/README.md b/examples/micro_batching/README.md new file mode 100644 index 0000000000..373a708520 --- /dev/null +++ b/examples/micro_batching/README.md @@ -0,0 +1,111 @@ +# Micro Batching +Accelerators like GPUs can be used most cost-efficiently for inference if they are steadily fed with incoming data. +TorchServe currently allows a single batch to be processed per backend worker. +In each worker the three computation steps (preprocess, inference, postprocess) are executed sequentially. +Because pre- and postprocessing are often carried out on the CPU, the GPU sits idle until the two CPU-bound steps are executed and the worker receives a new batch. +The following example will show how to make better use of an accelerator in high-load scenarios. + +For this, we are going to assume that there are a lot of incoming client requests and that we can potentially fill a bigger batch within the batch delay time frame in which the frontend collects requests for the next batch. +Given this precondition, we are going to increase the batch size which the backend worker receives and subsequently split the big batch up into smaller *micro* batches to perform the processing. +We can then perform the computation on the micro batches in parallel as more than one batch is available to the worker. +This way we can already process a micro batch on the GPU while the preprocessing is applied to the remaining micro batches.
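To make this idea concrete before looking at the TorchServe implementation, here is a minimal, framework-independent sketch of the pipelining scheme (all names are illustrative and not part of the TorchServe API):

```python
from queue import Queue
from threading import Thread

MICRO_BATCH_SIZE = 4  # illustrative value


def split(batch, size=MICRO_BATCH_SIZE):
    # Split the large batch received by the worker into micro batches.
    return [batch[i : i + size] for i in range(0, len(batch), size)]


def stage(fn, inbox, outbox):
    # Each stage runs in its own thread and forwards its results to the next
    # stage, so preprocessing of micro batch N+1 can overlap with inference
    # of micro batch N.
    while (item := inbox.get()) is not None:
        outbox.put(fn(item))
    outbox.put(None)  # propagate the end-of-batch marker


def pipelined_handle(batch, preprocess, inference, postprocess):
    q_pre, q_inf, q_post = Queue(), Queue(), Queue()
    threads = [
        Thread(target=stage, args=(preprocess, q_pre, q_inf)),
        Thread(target=stage, args=(inference, q_inf, q_post)),
    ]
    for t in threads:
        t.start()
    for micro_batch in split(batch):
        q_pre.put(micro_batch)
    q_pre.put(None)
    results = []
    while (out := q_post.get()) is not None:
        results.extend(postprocess(out))
    for t in threads:
        t.join()
    return results
```

A stage thread can pick up the next micro batch as soon as it hands the current one to the following queue, which is what keeps the accelerator busy while the CPU prepares more data.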
+The pros and cons of this approach are as follows: + +Pros: + +* Higher throughput by better utilizing the available accelerator +* Lower overall latency when enough requests are available for computation + +Cons: + +* Potentially higher latency and lower throughput if not enough requests are available + +## Implementation +This example implements micro batching using a custom handler which overwrites the *handle* method with a MicroBatching object defined in __ts.handler_utils.micro_batching__. +```python +from ts.handler_utils.micro_batching import MicroBatching +from ts.torch_handler.image_classifier import ImageClassifier + +class MicroBatchingHandler(ImageClassifier): + def __init__(self): + mb_handle = MicroBatching(self) + self.handle = mb_handle +``` +The MicroBatching object takes the custom handler as an input and spins up a number of threads. +Each thread will work on one of the processing steps (preprocess, inference, postprocess) of the custom handler while multiple threads can be assigned to process the same step in parallel. +The number of threads as well as the micro batch size are configurable through the [model yaml config](config.yaml): +```yaml +batchSize: 32 +micro_batching: + micro_batch_size: 4 + parallelism: + preprocess: 2 + inference: 1 + postprocess: 2 +``` +Each number in the *parallelism* dictionary represents the number of threads created for the respective step on initialization. +The `micro_batch_size` parameter should be chosen to be much smaller than the batch size configured through the TorchServe API (e.g. a micro batch size of 4 vs. a batch size of 64). + +## Example +The following example will take a ResNet18 image classification model and run the pre- and postprocessing in parallel, which includes resizing and cropping the image. + +First, we need to download the model weights: +```bash +$ cd +$ wget https://download.pytorch.org/models/resnet18-f37072fd.pth +``` +Second, we create the MAR file while including the necessary source and config files as additional files: +```bash +$ torch-model-archiver --model-name resnet-18_mb --version 1.0 --model-file ./examples/image_classifier/resnet_18/model.py --serialized-file resnet18-f37072fd.pth --handler examples/micro_batching/micro_batching_handler.py --extra-files ./examples/image_classifier/index_to_name.json --config-file examples/micro_batching/config.yaml +``` +Our MicroBatchingHandler defined in [micro_batching_handler.py](micro_batching_handler.py) inherits from ImageClassifier which already defines the necessary pre- and postprocessing. + +Third, we move the MAR file to our model_store and start TorchServe. +```bash +$ mkdir model_store +$ mv resnet-18_mb.mar model_store/ +$ torchserve --start --ncs --model-store model_store --models resnet-18_mb.mar +``` + +Finally, we test the registered model with a request: +```bash +$ curl http://127.0.0.1:8080/predictions/resnet-18_mb -T ./examples/image_classifier/kitten.jpg +``` +In the next section we will have a look at how the throughput and latency of the model behave by benchmarking it with TorchServe's benchmark tool. + +## Results +For the following benchmark we use [benchmark-ab.py](../../benchmarks/benchmark-ab.py) and a ResNet50 instead of the smaller ResNet18. +We ran this benchmark on an AWS g4dn.4xlarge instance with a single T4 GPU. +After creating the MAR file as described above we extract it into the model_store so we do not need to upload the file.
+```bash +$ unzip -d model_store/resnet-50_mb model_store/resnet-50_mb.mar +``` +Subsequently, we can run the benchmark with: +```bash +$ python3 benchmarks/benchmark-ab.py --config benchmarks/config.json +``` +The config.json for the benchmark has the following content: +```json +{ + "url":"/home/ubuntu/serve/model_store/resnet-50_mb/", + "requests": 50000, + "concurrency": 200, + "input": "/home/ubuntu/serve/examples/image_classifier/kitten.jpg", + "workers": "1", + "batch_size": 64 +} +``` +This will run the model with a batch size of 64 and a micro batch size of 4 as configured in the config.yaml. +For this section we ran the benchmark with different batch sizes and micro batch sizes (marked with "MBS=X") as well as different numbers of threads to create the following diagrams. +As a baseline we also ran the vanilla ImageClassifier handler without micro batching, which is marked as "NO MB". +![](assets/throughput_latency.png) +In the diagrams we see the throughput and P99 latency plotted over the batch size (as configured through the TorchServe API). +Each curve represents a different micro batch size as configured through [config.yaml](config.yaml). +We can see that the throughput stays flat for the vanilla ImageClassifier (NO MB), which suggests that the inference is preprocessing-bound and the GPU is underutilized, which can be confirmed with a look at the nvidia-smi output. +By interleaving the three compute steps and using two threads for pre- and postprocessing we see that the micro batched variants (MBS=4-16) achieve a higher throughput and even a lower batch latency as the GPU is better utilized due to the introduction of micro batches. +For this particular model we can achieve a throughput of up to 250 QPS by increasing the number of preprocessing threads to 4 and choosing 128 and 8 as batch size and micro batch size, respectively. +The actual achieved speedup will depend on the specific model as well as the intensity of the pre- and postprocessing steps. +Image scaling and decompression, for example, is usually more compute-intensive than text preprocessing. + +## Summary +In summary, we can see that micro batching can help to increase the throughput of a model while decreasing its latency. +This is especially true for workloads with compute-intensive pre- or postprocessing as well as smaller models. +The micro batching approach can also be used to save memory in a CPU use case by scaling the number of inference threads to >1, which allows us to run multiple instances of the model which all share the same underlying weights. +This is in contrast to running multiple TorchServe workers, which each create their own model instance and cannot share their weights as they reside in different processes.
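As a rough, hypothetical illustration of this weight-sharing point (the model choice, input shape and thread count are arbitrary and not taken from this example):

```python
import threading

import torch
import torchvision

# A single model instance whose weights are shared by all inference threads.
model = torchvision.models.resnet18().eval()


def infer(batch):
    # inference_mode avoids autograd bookkeeping; the threads only read the
    # shared weights, so no additional copies of the model are created.
    with torch.inference_mode():
        return model(batch)


threads = [
    threading.Thread(target=infer, args=(torch.randn(4, 3, 224, 224),))
    for _ in range(2)
]
for t in threads:
    t.start()
for t in threads:
    t.join()
```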
diff --git a/examples/micro_batching/assets/throughput_latency.png b/examples/micro_batching/assets/throughput_latency.png new file mode 100644 index 0000000000..8c00c575da Binary files /dev/null and b/examples/micro_batching/assets/throughput_latency.png differ diff --git a/examples/micro_batching/config.yaml b/examples/micro_batching/config.yaml new file mode 100644 index 0000000000..e97e58cc60 --- /dev/null +++ b/examples/micro_batching/config.yaml @@ -0,0 +1,8 @@ +batchSize: 32 + +micro_batching: + micro_batch_size: 4 + parallelism: + preprocess: 2 + inference: 1 + postprocess: 2 diff --git a/examples/micro_batching/micro_batching_handler.py b/examples/micro_batching/micro_batching_handler.py new file mode 100644 index 0000000000..bef34ed513 --- /dev/null +++ b/examples/micro_batching/micro_batching_handler.py @@ -0,0 +1,30 @@ +import logging + +from ts.handler_utils.micro_batching import MicroBatching +from ts.torch_handler.image_classifier import ImageClassifier + +logger = logging.getLogger(__name__) + + +class MicroBatchingHandler(ImageClassifier): + def __init__(self): + mb_handle = MicroBatching(self) + self.handle = mb_handle + + def initialize(self, ctx): + super().initialize(ctx) + + parallelism = ctx.model_yaml_config.get("micro_batching", {}).get( + "parallelism", None + ) + if parallelism: + logger.info( + f"Setting micro batching parallelism from model_config_yaml: {parallelism}" + ) + self.handle.parallelism = parallelism + + micro_batch_size = ctx.model_yaml_config.get("micro_batching", {}).get( + "micro_batch_size", 1 + ) + logger.info(f"Setting micro batching size: {micro_batch_size}") + self.handle.micro_batch_size = micro_batch_size diff --git a/examples/nvidia_dali/README.md b/examples/nvidia_dali/README.md new file mode 100644 index 0000000000..c660ff9e4c --- /dev/null +++ b/examples/nvidia_dali/README.md @@ -0,0 +1,88 @@ +# DALI Optimization integration with Torchserve models + +The NVIDIA Data Loading Library (DALI) is a library for data loading and pre-processing to accelerate deep learning applications. It provides a collection of highly optimized building blocks for loading and processing image, video and audio data. + +In this example, we use NVIDIA DALI for pre-processing the image input for inference with a resnet-18 model. + +Refer to the [NVIDIA-DALI-Documentation](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/index.html) for detailed information. + +### Install dependencies + +Navigate to the `serve/examples/nvidia_dali` directory and run the below command to install the dependencies: + +```bash +pip install -r requirements.txt +``` + +### Define and Build DALI Pipeline + +In DALI, any data processing task has a central object called Pipeline. +Refer to [NVIDIA-DALI](https://github.com/NVIDIA/DALI) for more details on the DALI pipeline. + +Navigate to `./serve/examples/nvidia_dali`. + +Change the following `dali_config.json` variables: + +`batch_size` - Maximum batch size of the pipeline. + +`num_threads` - Number of CPU threads used by the pipeline. + +`device_id` - ID of the GPU device used by the pipeline. + +Run the Python file which serializes the DALI pipeline and saves it to `model.dali`: + +```bash +python serialize_dali_pipeline.py --config dali_config.json +``` + +**__Note__**: + +- Make sure that the serialized file has the extension `.dali` +- The Torchserve batch size should match the DALI batch size.
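As a quick, optional sanity check of the serialized pipeline, something along the following lines should work (a hypothetical snippet assuming a local `kitten.jpg`; it mirrors what the custom handler in this example does at inference time):

```python
import numpy as np
from nvidia.dali.pipeline import Pipeline

# Deserialize (and build) the pipeline created by serialize_dali_pipeline.py.
pipe = Pipeline.deserialize(filename="model.dali")

# The external source expects the raw, still-encoded image bytes.
with open("kitten.jpg", "rb") as f:
    raw_image = np.frombuffer(f.read(), dtype=np.uint8)

# DALI's default prefetch queue depth is 2, so feed the external source once
# per queue slot before pulling the first output (the handler does the same).
for _ in range(2):
    pipe.feed_input("my_source", [raw_image])

(normalized,) = pipe.run()
print(normalized.as_tensor().shape())  # expected: [1, 3, 224, 224]
```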
+ +### Download the resnet .pth file + +```bash +wget https://download.pytorch.org/models/resnet18-f37072fd.pth +``` + +### Create model-archive file + +The following command creates a `.mar` file which also includes the `model.dali` and `dali_config.json` files. + +```bash +torch-model-archiver --model-name resnet-18 --version 1.0 --model-file ../image_classifier/resnet_18/model.py --serialized-file resnet18-f37072fd.pth --handler custom_handler.py --extra-files ../image_classifier/index_to_name.json,./model.dali,./dali_config.json +``` + +Navigate to the `serve` directory and run the below commands. + +Create a new directory `model_store` and move the model-archive file into it: + +```bash +mkdir model_store +mv resnet-18.mar model_store/ +``` + +### Start TorchServe + +```bash +torchserve --start --model-store model_store --models resnet-18=resnet-18.mar +``` + +### Run Inference + +Get the inference for a sample image using the below command: + +```bash +curl http://127.0.0.1:8080/predictions/resnet-18 -T ./examples/image_classifier/kitten.jpg +``` + +```json +{ + "tabby": 0.408751517534256, + "tiger_cat": 0.35404905676841736, + "Egyptian_cat": 0.12418942898511887, + "lynx": 0.025347290560603142, + "bucket": 0.011393273249268532 +} +``` diff --git a/examples/nvidia_dali/custom_handler.py b/examples/nvidia_dali/custom_handler.py new file mode 100644 index 0000000000..bf3337abb0 --- /dev/null +++ b/examples/nvidia_dali/custom_handler.py @@ -0,0 +1,77 @@ +# pylint: disable=W0223 +# Details : https://github.com/PyCQA/pylint/issues/3098 +""" +Custom handler for image classification with NVIDIA DALI pre-processing +""" +import json +import os + +import numpy as np +from nvidia.dali.pipeline import Pipeline +from nvidia.dali.plugin.pytorch import DALIGenericIterator, LastBatchPolicy + +from ts.torch_handler.image_classifier import ImageClassifier + + +class DALIHandler(ImageClassifier): + """ + Image classifier handler which runs its pre-processing through a serialized DALI pipeline + """ + + def __init__(self): + super(DALIHandler, self).__init__() + + def initialize(self, context): + super().initialize(context) + properties = context.system_properties + self.model_dir = properties.get("model_dir") + + self.dali_file = [ + file for file in os.listdir(self.model_dir) if file.endswith(".dali") + ] + if not len(self.dali_file): + raise RuntimeError("Missing dali pipeline file.") + self.PREFETCH_QUEUE_DEPTH = 2 + dali_config_file = os.path.join(self.model_dir, "dali_config.json") + if not os.path.isfile(dali_config_file): + raise RuntimeError("Missing dali_config.json file.") + with open(dali_config_file) as setup_config_file: + self.dali_configs = json.load(setup_config_file) + filename = os.path.join(self.model_dir, self.dali_file[0]) + self.pipe = Pipeline.deserialize(filename=filename) + # pylint: disable=protected-access + self.pipe._max_batch_size = self.dali_configs["batch_size"] + self.pipe._num_threads = self.dali_configs["num_threads"] + self.pipe._device_id = self.dali_configs["device_id"] + + def preprocess(self, data): + """The preprocess function decodes the request images and runs them through the DALI pipeline + + Args: + data (List): Input data from the request, in the form of encoded image bytes + + Returns: + Tensor: A batch of pre-processed images as a float tensor on the configured device.
+ """ + batch_tensor = [] + + input_byte_arrays = [i["body"] if "body" in i else i["data"] for i in data] + for byte_array in input_byte_arrays: + np_image = np.frombuffer(byte_array, dtype=np.uint8) + batch_tensor.append(np_image) # we can use numpy + + for _ in range(self.PREFETCH_QUEUE_DEPTH): + self.pipe.feed_input("my_source", batch_tensor) + + datam = DALIGenericIterator( + [self.pipe], + ["data"], + last_batch_policy=LastBatchPolicy.PARTIAL, + last_batch_padded=True, + ) + result = [] + for _, data in enumerate(datam): + result.append(data[0]["data"]) + break + + return result[0].to(self.device) diff --git a/examples/nvidia_dali/dali_config.json b/examples/nvidia_dali/dali_config.json new file mode 100644 index 0000000000..63611f804f --- /dev/null +++ b/examples/nvidia_dali/dali_config.json @@ -0,0 +1,5 @@ +{ + "batch_size" : 5, + "num_threads" : 2, + "device_id" : 0 +} diff --git a/examples/nvidia_dali/index_to_name.json b/examples/nvidia_dali/index_to_name.json new file mode 100644 index 0000000000..5fe0dfefcd --- /dev/null +++ b/examples/nvidia_dali/index_to_name.json @@ -0,0 +1 @@ +{"0": ["n01440764", "tench"], "1": ["n01443537", "goldfish"], "2": ["n01484850", "great_white_shark"], "3": ["n01491361", "tiger_shark"], "4": ["n01494475", "hammerhead"], "5": ["n01496331", "electric_ray"], "6": ["n01498041", "stingray"], "7": ["n01514668", "cock"], "8": ["n01514859", "hen"], "9": ["n01518878", "ostrich"], "10": ["n01530575", "brambling"], "11": ["n01531178", "goldfinch"], "12": ["n01532829", "house_finch"], "13": ["n01534433", "junco"], "14": ["n01537544", "indigo_bunting"], "15": ["n01558993", "robin"], "16": ["n01560419", "bulbul"], "17": ["n01580077", "jay"], "18": ["n01582220", "magpie"], "19": ["n01592084", "chickadee"], "20": ["n01601694", "water_ouzel"], "21": ["n01608432", "kite"], "22": ["n01614925", "bald_eagle"], "23": ["n01616318", "vulture"], "24": ["n01622779", "great_grey_owl"], "25": ["n01629819", "European_fire_salamander"], "26": ["n01630670", "common_newt"], "27": ["n01631663", "eft"], "28": ["n01632458", "spotted_salamander"], "29": ["n01632777", "axolotl"], "30": ["n01641577", "bullfrog"], "31": ["n01644373", "tree_frog"], "32": ["n01644900", "tailed_frog"], "33": ["n01664065", "loggerhead"], "34": ["n01665541", "leatherback_turtle"], "35": ["n01667114", "mud_turtle"], "36": ["n01667778", "terrapin"], "37": ["n01669191", "box_turtle"], "38": ["n01675722", "banded_gecko"], "39": ["n01677366", "common_iguana"], "40": ["n01682714", "American_chameleon"], "41": ["n01685808", "whiptail"], "42": ["n01687978", "agama"], "43": ["n01688243", "frilled_lizard"], "44": ["n01689811", "alligator_lizard"], "45": ["n01692333", "Gila_monster"], "46": ["n01693334", "green_lizard"], "47": ["n01694178", "African_chameleon"], "48": ["n01695060", "Komodo_dragon"], "49": ["n01697457", "African_crocodile"], "50": ["n01698640", "American_alligator"], "51": ["n01704323", "triceratops"], "52": ["n01728572", "thunder_snake"], "53": ["n01728920", "ringneck_snake"], "54": ["n01729322", "hognose_snake"], "55": ["n01729977", "green_snake"], "56": ["n01734418", "king_snake"], "57": ["n01735189", "garter_snake"], "58": ["n01737021", "water_snake"], "59": ["n01739381", "vine_snake"], "60": ["n01740131", "night_snake"], "61": ["n01742172", "boa_constrictor"], "62": ["n01744401", "rock_python"], "63": ["n01748264", "Indian_cobra"], "64": ["n01749939", "green_mamba"], "65": ["n01751748", "sea_snake"], "66": ["n01753488", "horned_viper"], "67": ["n01755581", "diamondback"], "68": ["n01756291", 
"sidewinder"], "69": ["n01768244", "trilobite"], "70": ["n01770081", "harvestman"], "71": ["n01770393", "scorpion"], "72": ["n01773157", "black_and_gold_garden_spider"], "73": ["n01773549", "barn_spider"], "74": ["n01773797", "garden_spider"], "75": ["n01774384", "black_widow"], "76": ["n01774750", "tarantula"], "77": ["n01775062", "wolf_spider"], "78": ["n01776313", "tick"], "79": ["n01784675", "centipede"], "80": ["n01795545", "black_grouse"], "81": ["n01796340", "ptarmigan"], "82": ["n01797886", "ruffed_grouse"], "83": ["n01798484", "prairie_chicken"], "84": ["n01806143", "peacock"], "85": ["n01806567", "quail"], "86": ["n01807496", "partridge"], "87": ["n01817953", "African_grey"], "88": ["n01818515", "macaw"], "89": ["n01819313", "sulphur-crested_cockatoo"], "90": ["n01820546", "lorikeet"], "91": ["n01824575", "coucal"], "92": ["n01828970", "bee_eater"], "93": ["n01829413", "hornbill"], "94": ["n01833805", "hummingbird"], "95": ["n01843065", "jacamar"], "96": ["n01843383", "toucan"], "97": ["n01847000", "drake"], "98": ["n01855032", "red-breasted_merganser"], "99": ["n01855672", "goose"], "100": ["n01860187", "black_swan"], "101": ["n01871265", "tusker"], "102": ["n01872401", "echidna"], "103": ["n01873310", "platypus"], "104": ["n01877812", "wallaby"], "105": ["n01882714", "koala"], "106": ["n01883070", "wombat"], "107": ["n01910747", "jellyfish"], "108": ["n01914609", "sea_anemone"], "109": ["n01917289", "brain_coral"], "110": ["n01924916", "flatworm"], "111": ["n01930112", "nematode"], "112": ["n01943899", "conch"], "113": ["n01944390", "snail"], "114": ["n01945685", "slug"], "115": ["n01950731", "sea_slug"], "116": ["n01955084", "chiton"], "117": ["n01968897", "chambered_nautilus"], "118": ["n01978287", "Dungeness_crab"], "119": ["n01978455", "rock_crab"], "120": ["n01980166", "fiddler_crab"], "121": ["n01981276", "king_crab"], "122": ["n01983481", "American_lobster"], "123": ["n01984695", "spiny_lobster"], "124": ["n01985128", "crayfish"], "125": ["n01986214", "hermit_crab"], "126": ["n01990800", "isopod"], "127": ["n02002556", "white_stork"], "128": ["n02002724", "black_stork"], "129": ["n02006656", "spoonbill"], "130": ["n02007558", "flamingo"], "131": ["n02009229", "little_blue_heron"], "132": ["n02009912", "American_egret"], "133": ["n02011460", "bittern"], "134": ["n02012849", "crane"], "135": ["n02013706", "limpkin"], "136": ["n02017213", "European_gallinule"], "137": ["n02018207", "American_coot"], "138": ["n02018795", "bustard"], "139": ["n02025239", "ruddy_turnstone"], "140": ["n02027492", "red-backed_sandpiper"], "141": ["n02028035", "redshank"], "142": ["n02033041", "dowitcher"], "143": ["n02037110", "oystercatcher"], "144": ["n02051845", "pelican"], "145": ["n02056570", "king_penguin"], "146": ["n02058221", "albatross"], "147": ["n02066245", "grey_whale"], "148": ["n02071294", "killer_whale"], "149": ["n02074367", "dugong"], "150": ["n02077923", "sea_lion"], "151": ["n02085620", "Chihuahua"], "152": ["n02085782", "Japanese_spaniel"], "153": ["n02085936", "Maltese_dog"], "154": ["n02086079", "Pekinese"], "155": ["n02086240", "Shih-Tzu"], "156": ["n02086646", "Blenheim_spaniel"], "157": ["n02086910", "papillon"], "158": ["n02087046", "toy_terrier"], "159": ["n02087394", "Rhodesian_ridgeback"], "160": ["n02088094", "Afghan_hound"], "161": ["n02088238", "basset"], "162": ["n02088364", "beagle"], "163": ["n02088466", "bloodhound"], "164": ["n02088632", "bluetick"], "165": ["n02089078", "black-and-tan_coonhound"], "166": ["n02089867", "Walker_hound"], "167": ["n02089973", 
"English_foxhound"], "168": ["n02090379", "redbone"], "169": ["n02090622", "borzoi"], "170": ["n02090721", "Irish_wolfhound"], "171": ["n02091032", "Italian_greyhound"], "172": ["n02091134", "whippet"], "173": ["n02091244", "Ibizan_hound"], "174": ["n02091467", "Norwegian_elkhound"], "175": ["n02091635", "otterhound"], "176": ["n02091831", "Saluki"], "177": ["n02092002", "Scottish_deerhound"], "178": ["n02092339", "Weimaraner"], "179": ["n02093256", "Staffordshire_bullterrier"], "180": ["n02093428", "American_Staffordshire_terrier"], "181": ["n02093647", "Bedlington_terrier"], "182": ["n02093754", "Border_terrier"], "183": ["n02093859", "Kerry_blue_terrier"], "184": ["n02093991", "Irish_terrier"], "185": ["n02094114", "Norfolk_terrier"], "186": ["n02094258", "Norwich_terrier"], "187": ["n02094433", "Yorkshire_terrier"], "188": ["n02095314", "wire-haired_fox_terrier"], "189": ["n02095570", "Lakeland_terrier"], "190": ["n02095889", "Sealyham_terrier"], "191": ["n02096051", "Airedale"], "192": ["n02096177", "cairn"], "193": ["n02096294", "Australian_terrier"], "194": ["n02096437", "Dandie_Dinmont"], "195": ["n02096585", "Boston_bull"], "196": ["n02097047", "miniature_schnauzer"], "197": ["n02097130", "giant_schnauzer"], "198": ["n02097209", "standard_schnauzer"], "199": ["n02097298", "Scotch_terrier"], "200": ["n02097474", "Tibetan_terrier"], "201": ["n02097658", "silky_terrier"], "202": ["n02098105", "soft-coated_wheaten_terrier"], "203": ["n02098286", "West_Highland_white_terrier"], "204": ["n02098413", "Lhasa"], "205": ["n02099267", "flat-coated_retriever"], "206": ["n02099429", "curly-coated_retriever"], "207": ["n02099601", "golden_retriever"], "208": ["n02099712", "Labrador_retriever"], "209": ["n02099849", "Chesapeake_Bay_retriever"], "210": ["n02100236", "German_short-haired_pointer"], "211": ["n02100583", "vizsla"], "212": ["n02100735", "English_setter"], "213": ["n02100877", "Irish_setter"], "214": ["n02101006", "Gordon_setter"], "215": ["n02101388", "Brittany_spaniel"], "216": ["n02101556", "clumber"], "217": ["n02102040", "English_springer"], "218": ["n02102177", "Welsh_springer_spaniel"], "219": ["n02102318", "cocker_spaniel"], "220": ["n02102480", "Sussex_spaniel"], "221": ["n02102973", "Irish_water_spaniel"], "222": ["n02104029", "kuvasz"], "223": ["n02104365", "schipperke"], "224": ["n02105056", "groenendael"], "225": ["n02105162", "malinois"], "226": ["n02105251", "briard"], "227": ["n02105412", "kelpie"], "228": ["n02105505", "komondor"], "229": ["n02105641", "Old_English_sheepdog"], "230": ["n02105855", "Shetland_sheepdog"], "231": ["n02106030", "collie"], "232": ["n02106166", "Border_collie"], "233": ["n02106382", "Bouvier_des_Flandres"], "234": ["n02106550", "Rottweiler"], "235": ["n02106662", "German_shepherd"], "236": ["n02107142", "Doberman"], "237": ["n02107312", "miniature_pinscher"], "238": ["n02107574", "Greater_Swiss_Mountain_dog"], "239": ["n02107683", "Bernese_mountain_dog"], "240": ["n02107908", "Appenzeller"], "241": ["n02108000", "EntleBucher"], "242": ["n02108089", "boxer"], "243": ["n02108422", "bull_mastiff"], "244": ["n02108551", "Tibetan_mastiff"], "245": ["n02108915", "French_bulldog"], "246": ["n02109047", "Great_Dane"], "247": ["n02109525", "Saint_Bernard"], "248": ["n02109961", "Eskimo_dog"], "249": ["n02110063", "malamute"], "250": ["n02110185", "Siberian_husky"], "251": ["n02110341", "dalmatian"], "252": ["n02110627", "affenpinscher"], "253": ["n02110806", "basenji"], "254": ["n02110958", "pug"], "255": ["n02111129", "Leonberg"], "256": 
["n02111277", "Newfoundland"], "257": ["n02111500", "Great_Pyrenees"], "258": ["n02111889", "Samoyed"], "259": ["n02112018", "Pomeranian"], "260": ["n02112137", "chow"], "261": ["n02112350", "keeshond"], "262": ["n02112706", "Brabancon_griffon"], "263": ["n02113023", "Pembroke"], "264": ["n02113186", "Cardigan"], "265": ["n02113624", "toy_poodle"], "266": ["n02113712", "miniature_poodle"], "267": ["n02113799", "standard_poodle"], "268": ["n02113978", "Mexican_hairless"], "269": ["n02114367", "timber_wolf"], "270": ["n02114548", "white_wolf"], "271": ["n02114712", "red_wolf"], "272": ["n02114855", "coyote"], "273": ["n02115641", "dingo"], "274": ["n02115913", "dhole"], "275": ["n02116738", "African_hunting_dog"], "276": ["n02117135", "hyena"], "277": ["n02119022", "red_fox"], "278": ["n02119789", "kit_fox"], "279": ["n02120079", "Arctic_fox"], "280": ["n02120505", "grey_fox"], "281": ["n02123045", "tabby"], "282": ["n02123159", "tiger_cat"], "283": ["n02123394", "Persian_cat"], "284": ["n02123597", "Siamese_cat"], "285": ["n02124075", "Egyptian_cat"], "286": ["n02125311", "cougar"], "287": ["n02127052", "lynx"], "288": ["n02128385", "leopard"], "289": ["n02128757", "snow_leopard"], "290": ["n02128925", "jaguar"], "291": ["n02129165", "lion"], "292": ["n02129604", "tiger"], "293": ["n02130308", "cheetah"], "294": ["n02132136", "brown_bear"], "295": ["n02133161", "American_black_bear"], "296": ["n02134084", "ice_bear"], "297": ["n02134418", "sloth_bear"], "298": ["n02137549", "mongoose"], "299": ["n02138441", "meerkat"], "300": ["n02165105", "tiger_beetle"], "301": ["n02165456", "ladybug"], "302": ["n02167151", "ground_beetle"], "303": ["n02168699", "long-horned_beetle"], "304": ["n02169497", "leaf_beetle"], "305": ["n02172182", "dung_beetle"], "306": ["n02174001", "rhinoceros_beetle"], "307": ["n02177972", "weevil"], "308": ["n02190166", "fly"], "309": ["n02206856", "bee"], "310": ["n02219486", "ant"], "311": ["n02226429", "grasshopper"], "312": ["n02229544", "cricket"], "313": ["n02231487", "walking_stick"], "314": ["n02233338", "cockroach"], "315": ["n02236044", "mantis"], "316": ["n02256656", "cicada"], "317": ["n02259212", "leafhopper"], "318": ["n02264363", "lacewing"], "319": ["n02268443", "dragonfly"], "320": ["n02268853", "damselfly"], "321": ["n02276258", "admiral"], "322": ["n02277742", "ringlet"], "323": ["n02279972", "monarch"], "324": ["n02280649", "cabbage_butterfly"], "325": ["n02281406", "sulphur_butterfly"], "326": ["n02281787", "lycaenid"], "327": ["n02317335", "starfish"], "328": ["n02319095", "sea_urchin"], "329": ["n02321529", "sea_cucumber"], "330": ["n02325366", "wood_rabbit"], "331": ["n02326432", "hare"], "332": ["n02328150", "Angora"], "333": ["n02342885", "hamster"], "334": ["n02346627", "porcupine"], "335": ["n02356798", "fox_squirrel"], "336": ["n02361337", "marmot"], "337": ["n02363005", "beaver"], "338": ["n02364673", "guinea_pig"], "339": ["n02389026", "sorrel"], "340": ["n02391049", "zebra"], "341": ["n02395406", "hog"], "342": ["n02396427", "wild_boar"], "343": ["n02397096", "warthog"], "344": ["n02398521", "hippopotamus"], "345": ["n02403003", "ox"], "346": ["n02408429", "water_buffalo"], "347": ["n02410509", "bison"], "348": ["n02412080", "ram"], "349": ["n02415577", "bighorn"], "350": ["n02417914", "ibex"], "351": ["n02422106", "hartebeest"], "352": ["n02422699", "impala"], "353": ["n02423022", "gazelle"], "354": ["n02437312", "Arabian_camel"], "355": ["n02437616", "llama"], "356": ["n02441942", "weasel"], "357": ["n02442845", "mink"], "358": 
["n02443114", "polecat"], "359": ["n02443484", "black-footed_ferret"], "360": ["n02444819", "otter"], "361": ["n02445715", "skunk"], "362": ["n02447366", "badger"], "363": ["n02454379", "armadillo"], "364": ["n02457408", "three-toed_sloth"], "365": ["n02480495", "orangutan"], "366": ["n02480855", "gorilla"], "367": ["n02481823", "chimpanzee"], "368": ["n02483362", "gibbon"], "369": ["n02483708", "siamang"], "370": ["n02484975", "guenon"], "371": ["n02486261", "patas"], "372": ["n02486410", "baboon"], "373": ["n02487347", "macaque"], "374": ["n02488291", "langur"], "375": ["n02488702", "colobus"], "376": ["n02489166", "proboscis_monkey"], "377": ["n02490219", "marmoset"], "378": ["n02492035", "capuchin"], "379": ["n02492660", "howler_monkey"], "380": ["n02493509", "titi"], "381": ["n02493793", "spider_monkey"], "382": ["n02494079", "squirrel_monkey"], "383": ["n02497673", "Madagascar_cat"], "384": ["n02500267", "indri"], "385": ["n02504013", "Indian_elephant"], "386": ["n02504458", "African_elephant"], "387": ["n02509815", "lesser_panda"], "388": ["n02510455", "giant_panda"], "389": ["n02514041", "barracouta"], "390": ["n02526121", "eel"], "391": ["n02536864", "coho"], "392": ["n02606052", "rock_beauty"], "393": ["n02607072", "anemone_fish"], "394": ["n02640242", "sturgeon"], "395": ["n02641379", "gar"], "396": ["n02643566", "lionfish"], "397": ["n02655020", "puffer"], "398": ["n02666196", "abacus"], "399": ["n02667093", "abaya"], "400": ["n02669723", "academic_gown"], "401": ["n02672831", "accordion"], "402": ["n02676566", "acoustic_guitar"], "403": ["n02687172", "aircraft_carrier"], "404": ["n02690373", "airliner"], "405": ["n02692877", "airship"], "406": ["n02699494", "altar"], "407": ["n02701002", "ambulance"], "408": ["n02704792", "amphibian"], "409": ["n02708093", "analog_clock"], "410": ["n02727426", "apiary"], "411": ["n02730930", "apron"], "412": ["n02747177", "ashcan"], "413": ["n02749479", "assault_rifle"], "414": ["n02769748", "backpack"], "415": ["n02776631", "bakery"], "416": ["n02777292", "balance_beam"], "417": ["n02782093", "balloon"], "418": ["n02783161", "ballpoint"], "419": ["n02786058", "Band_Aid"], "420": ["n02787622", "banjo"], "421": ["n02788148", "bannister"], "422": ["n02790996", "barbell"], "423": ["n02791124", "barber_chair"], "424": ["n02791270", "barbershop"], "425": ["n02793495", "barn"], "426": ["n02794156", "barometer"], "427": ["n02795169", "barrel"], "428": ["n02797295", "barrow"], "429": ["n02799071", "baseball"], "430": ["n02802426", "basketball"], "431": ["n02804414", "bassinet"], "432": ["n02804610", "bassoon"], "433": ["n02807133", "bathing_cap"], "434": ["n02808304", "bath_towel"], "435": ["n02808440", "bathtub"], "436": ["n02814533", "beach_wagon"], "437": ["n02814860", "beacon"], "438": ["n02815834", "beaker"], "439": ["n02817516", "bearskin"], "440": ["n02823428", "beer_bottle"], "441": ["n02823750", "beer_glass"], "442": ["n02825657", "bell_cote"], "443": ["n02834397", "bib"], "444": ["n02835271", "bicycle-built-for-two"], "445": ["n02837789", "bikini"], "446": ["n02840245", "binder"], "447": ["n02841315", "binoculars"], "448": ["n02843684", "birdhouse"], "449": ["n02859443", "boathouse"], "450": ["n02860847", "bobsled"], "451": ["n02865351", "bolo_tie"], "452": ["n02869837", "bonnet"], "453": ["n02870880", "bookcase"], "454": ["n02871525", "bookshop"], "455": ["n02877765", "bottlecap"], "456": ["n02879718", "bow"], "457": ["n02883205", "bow_tie"], "458": ["n02892201", "brass"], "459": ["n02892767", "brassiere"], "460": ["n02894605", 
"breakwater"], "461": ["n02895154", "breastplate"], "462": ["n02906734", "broom"], "463": ["n02909870", "bucket"], "464": ["n02910353", "buckle"], "465": ["n02916936", "bulletproof_vest"], "466": ["n02917067", "bullet_train"], "467": ["n02927161", "butcher_shop"], "468": ["n02930766", "cab"], "469": ["n02939185", "caldron"], "470": ["n02948072", "candle"], "471": ["n02950826", "cannon"], "472": ["n02951358", "canoe"], "473": ["n02951585", "can_opener"], "474": ["n02963159", "cardigan"], "475": ["n02965783", "car_mirror"], "476": ["n02966193", "carousel"], "477": ["n02966687", "carpenter's_kit"], "478": ["n02971356", "carton"], "479": ["n02974003", "car_wheel"], "480": ["n02977058", "cash_machine"], "481": ["n02978881", "cassette"], "482": ["n02979186", "cassette_player"], "483": ["n02980441", "castle"], "484": ["n02981792", "catamaran"], "485": ["n02988304", "CD_player"], "486": ["n02992211", "cello"], "487": ["n02992529", "cellular_telephone"], "488": ["n02999410", "chain"], "489": ["n03000134", "chainlink_fence"], "490": ["n03000247", "chain_mail"], "491": ["n03000684", "chain_saw"], "492": ["n03014705", "chest"], "493": ["n03016953", "chiffonier"], "494": ["n03017168", "chime"], "495": ["n03018349", "china_cabinet"], "496": ["n03026506", "Christmas_stocking"], "497": ["n03028079", "church"], "498": ["n03032252", "cinema"], "499": ["n03041632", "cleaver"], "500": ["n03042490", "cliff_dwelling"], "501": ["n03045698", "cloak"], "502": ["n03047690", "clog"], "503": ["n03062245", "cocktail_shaker"], "504": ["n03063599", "coffee_mug"], "505": ["n03063689", "coffeepot"], "506": ["n03065424", "coil"], "507": ["n03075370", "combination_lock"], "508": ["n03085013", "computer_keyboard"], "509": ["n03089624", "confectionery"], "510": ["n03095699", "container_ship"], "511": ["n03100240", "convertible"], "512": ["n03109150", "corkscrew"], "513": ["n03110669", "cornet"], "514": ["n03124043", "cowboy_boot"], "515": ["n03124170", "cowboy_hat"], "516": ["n03125729", "cradle"], "517": ["n03126707", "crane"], "518": ["n03127747", "crash_helmet"], "519": ["n03127925", "crate"], "520": ["n03131574", "crib"], "521": ["n03133878", "Crock_Pot"], "522": ["n03134739", "croquet_ball"], "523": ["n03141823", "crutch"], "524": ["n03146219", "cuirass"], "525": ["n03160309", "dam"], "526": ["n03179701", "desk"], "527": ["n03180011", "desktop_computer"], "528": ["n03187595", "dial_telephone"], "529": ["n03188531", "diaper"], "530": ["n03196217", "digital_clock"], "531": ["n03197337", "digital_watch"], "532": ["n03201208", "dining_table"], "533": ["n03207743", "dishrag"], "534": ["n03207941", "dishwasher"], "535": ["n03208938", "disk_brake"], "536": ["n03216828", "dock"], "537": ["n03218198", "dogsled"], "538": ["n03220513", "dome"], "539": ["n03223299", "doormat"], "540": ["n03240683", "drilling_platform"], "541": ["n03249569", "drum"], "542": ["n03250847", "drumstick"], "543": ["n03255030", "dumbbell"], "544": ["n03259280", "Dutch_oven"], "545": ["n03271574", "electric_fan"], "546": ["n03272010", "electric_guitar"], "547": ["n03272562", "electric_locomotive"], "548": ["n03290653", "entertainment_center"], "549": ["n03291819", "envelope"], "550": ["n03297495", "espresso_maker"], "551": ["n03314780", "face_powder"], "552": ["n03325584", "feather_boa"], "553": ["n03337140", "file"], "554": ["n03344393", "fireboat"], "555": ["n03345487", "fire_engine"], "556": ["n03347037", "fire_screen"], "557": ["n03355925", "flagpole"], "558": ["n03372029", "flute"], "559": ["n03376595", "folding_chair"], "560": ["n03379051", 
"football_helmet"], "561": ["n03384352", "forklift"], "562": ["n03388043", "fountain"], "563": ["n03388183", "fountain_pen"], "564": ["n03388549", "four-poster"], "565": ["n03393912", "freight_car"], "566": ["n03394916", "French_horn"], "567": ["n03400231", "frying_pan"], "568": ["n03404251", "fur_coat"], "569": ["n03417042", "garbage_truck"], "570": ["n03424325", "gasmask"], "571": ["n03425413", "gas_pump"], "572": ["n03443371", "goblet"], "573": ["n03444034", "go-kart"], "574": ["n03445777", "golf_ball"], "575": ["n03445924", "golfcart"], "576": ["n03447447", "gondola"], "577": ["n03447721", "gong"], "578": ["n03450230", "gown"], "579": ["n03452741", "grand_piano"], "580": ["n03457902", "greenhouse"], "581": ["n03459775", "grille"], "582": ["n03461385", "grocery_store"], "583": ["n03467068", "guillotine"], "584": ["n03476684", "hair_slide"], "585": ["n03476991", "hair_spray"], "586": ["n03478589", "half_track"], "587": ["n03481172", "hammer"], "588": ["n03482405", "hamper"], "589": ["n03483316", "hand_blower"], "590": ["n03485407", "hand-held_computer"], "591": ["n03485794", "handkerchief"], "592": ["n03492542", "hard_disc"], "593": ["n03494278", "harmonica"], "594": ["n03495258", "harp"], "595": ["n03496892", "harvester"], "596": ["n03498962", "hatchet"], "597": ["n03527444", "holster"], "598": ["n03529860", "home_theater"], "599": ["n03530642", "honeycomb"], "600": ["n03532672", "hook"], "601": ["n03534580", "hoopskirt"], "602": ["n03535780", "horizontal_bar"], "603": ["n03538406", "horse_cart"], "604": ["n03544143", "hourglass"], "605": ["n03584254", "iPod"], "606": ["n03584829", "iron"], "607": ["n03590841", "jack-o'-lantern"], "608": ["n03594734", "jean"], "609": ["n03594945", "jeep"], "610": ["n03595614", "jersey"], "611": ["n03598930", "jigsaw_puzzle"], "612": ["n03599486", "jinrikisha"], "613": ["n03602883", "joystick"], "614": ["n03617480", "kimono"], "615": ["n03623198", "knee_pad"], "616": ["n03627232", "knot"], "617": ["n03630383", "lab_coat"], "618": ["n03633091", "ladle"], "619": ["n03637318", "lampshade"], "620": ["n03642806", "laptop"], "621": ["n03649909", "lawn_mower"], "622": ["n03657121", "lens_cap"], "623": ["n03658185", "letter_opener"], "624": ["n03661043", "library"], "625": ["n03662601", "lifeboat"], "626": ["n03666591", "lighter"], "627": ["n03670208", "limousine"], "628": ["n03673027", "liner"], "629": ["n03676483", "lipstick"], "630": ["n03680355", "Loafer"], "631": ["n03690938", "lotion"], "632": ["n03691459", "loudspeaker"], "633": ["n03692522", "loupe"], "634": ["n03697007", "lumbermill"], "635": ["n03706229", "magnetic_compass"], "636": ["n03709823", "mailbag"], "637": ["n03710193", "mailbox"], "638": ["n03710637", "maillot"], "639": ["n03710721", "maillot"], "640": ["n03717622", "manhole_cover"], "641": ["n03720891", "maraca"], "642": ["n03721384", "marimba"], "643": ["n03724870", "mask"], "644": ["n03729826", "matchstick"], "645": ["n03733131", "maypole"], "646": ["n03733281", "maze"], "647": ["n03733805", "measuring_cup"], "648": ["n03742115", "medicine_chest"], "649": ["n03743016", "megalith"], "650": ["n03759954", "microphone"], "651": ["n03761084", "microwave"], "652": ["n03763968", "military_uniform"], "653": ["n03764736", "milk_can"], "654": ["n03769881", "minibus"], "655": ["n03770439", "miniskirt"], "656": ["n03770679", "minivan"], "657": ["n03773504", "missile"], "658": ["n03775071", "mitten"], "659": ["n03775546", "mixing_bowl"], "660": ["n03776460", "mobile_home"], "661": ["n03777568", "Model_T"], "662": ["n03777754", "modem"], "663": 
["n03781244", "monastery"], "664": ["n03782006", "monitor"], "665": ["n03785016", "moped"], "666": ["n03786901", "mortar"], "667": ["n03787032", "mortarboard"], "668": ["n03788195", "mosque"], "669": ["n03788365", "mosquito_net"], "670": ["n03791053", "motor_scooter"], "671": ["n03792782", "mountain_bike"], "672": ["n03792972", "mountain_tent"], "673": ["n03793489", "mouse"], "674": ["n03794056", "mousetrap"], "675": ["n03796401", "moving_van"], "676": ["n03803284", "muzzle"], "677": ["n03804744", "nail"], "678": ["n03814639", "neck_brace"], "679": ["n03814906", "necklace"], "680": ["n03825788", "nipple"], "681": ["n03832673", "notebook"], "682": ["n03837869", "obelisk"], "683": ["n03838899", "oboe"], "684": ["n03840681", "ocarina"], "685": ["n03841143", "odometer"], "686": ["n03843555", "oil_filter"], "687": ["n03854065", "organ"], "688": ["n03857828", "oscilloscope"], "689": ["n03866082", "overskirt"], "690": ["n03868242", "oxcart"], "691": ["n03868863", "oxygen_mask"], "692": ["n03871628", "packet"], "693": ["n03873416", "paddle"], "694": ["n03874293", "paddlewheel"], "695": ["n03874599", "padlock"], "696": ["n03876231", "paintbrush"], "697": ["n03877472", "pajama"], "698": ["n03877845", "palace"], "699": ["n03884397", "panpipe"], "700": ["n03887697", "paper_towel"], "701": ["n03888257", "parachute"], "702": ["n03888605", "parallel_bars"], "703": ["n03891251", "park_bench"], "704": ["n03891332", "parking_meter"], "705": ["n03895866", "passenger_car"], "706": ["n03899768", "patio"], "707": ["n03902125", "pay-phone"], "708": ["n03903868", "pedestal"], "709": ["n03908618", "pencil_box"], "710": ["n03908714", "pencil_sharpener"], "711": ["n03916031", "perfume"], "712": ["n03920288", "Petri_dish"], "713": ["n03924679", "photocopier"], "714": ["n03929660", "pick"], "715": ["n03929855", "pickelhaube"], "716": ["n03930313", "picket_fence"], "717": ["n03930630", "pickup"], "718": ["n03933933", "pier"], "719": ["n03935335", "piggy_bank"], "720": ["n03937543", "pill_bottle"], "721": ["n03938244", "pillow"], "722": ["n03942813", "ping-pong_ball"], "723": ["n03944341", "pinwheel"], "724": ["n03947888", "pirate"], "725": ["n03950228", "pitcher"], "726": ["n03954731", "plane"], "727": ["n03956157", "planetarium"], "728": ["n03958227", "plastic_bag"], "729": ["n03961711", "plate_rack"], "730": ["n03967562", "plow"], "731": ["n03970156", "plunger"], "732": ["n03976467", "Polaroid_camera"], "733": ["n03976657", "pole"], "734": ["n03977966", "police_van"], "735": ["n03980874", "poncho"], "736": ["n03982430", "pool_table"], "737": ["n03983396", "pop_bottle"], "738": ["n03991062", "pot"], "739": ["n03992509", "potter's_wheel"], "740": ["n03995372", "power_drill"], "741": ["n03998194", "prayer_rug"], "742": ["n04004767", "printer"], "743": ["n04005630", "prison"], "744": ["n04008634", "projectile"], "745": ["n04009552", "projector"], "746": ["n04019541", "puck"], "747": ["n04023962", "punching_bag"], "748": ["n04026417", "purse"], "749": ["n04033901", "quill"], "750": ["n04033995", "quilt"], "751": ["n04037443", "racer"], "752": ["n04039381", "racket"], "753": ["n04040759", "radiator"], "754": ["n04041544", "radio"], "755": ["n04044716", "radio_telescope"], "756": ["n04049303", "rain_barrel"], "757": ["n04065272", "recreational_vehicle"], "758": ["n04067472", "reel"], "759": ["n04069434", "reflex_camera"], "760": ["n04070727", "refrigerator"], "761": ["n04074963", "remote_control"], "762": ["n04081281", "restaurant"], "763": ["n04086273", "revolver"], "764": ["n04090263", "rifle"], "765": ["n04099969", 
"rocking_chair"], "766": ["n04111531", "rotisserie"], "767": ["n04116512", "rubber_eraser"], "768": ["n04118538", "rugby_ball"], "769": ["n04118776", "rule"], "770": ["n04120489", "running_shoe"], "771": ["n04125021", "safe"], "772": ["n04127249", "safety_pin"], "773": ["n04131690", "saltshaker"], "774": ["n04133789", "sandal"], "775": ["n04136333", "sarong"], "776": ["n04141076", "sax"], "777": ["n04141327", "scabbard"], "778": ["n04141975", "scale"], "779": ["n04146614", "school_bus"], "780": ["n04147183", "schooner"], "781": ["n04149813", "scoreboard"], "782": ["n04152593", "screen"], "783": ["n04153751", "screw"], "784": ["n04154565", "screwdriver"], "785": ["n04162706", "seat_belt"], "786": ["n04179913", "sewing_machine"], "787": ["n04192698", "shield"], "788": ["n04200800", "shoe_shop"], "789": ["n04201297", "shoji"], "790": ["n04204238", "shopping_basket"], "791": ["n04204347", "shopping_cart"], "792": ["n04208210", "shovel"], "793": ["n04209133", "shower_cap"], "794": ["n04209239", "shower_curtain"], "795": ["n04228054", "ski"], "796": ["n04229816", "ski_mask"], "797": ["n04235860", "sleeping_bag"], "798": ["n04238763", "slide_rule"], "799": ["n04239074", "sliding_door"], "800": ["n04243546", "slot"], "801": ["n04251144", "snorkel"], "802": ["n04252077", "snowmobile"], "803": ["n04252225", "snowplow"], "804": ["n04254120", "soap_dispenser"], "805": ["n04254680", "soccer_ball"], "806": ["n04254777", "sock"], "807": ["n04258138", "solar_dish"], "808": ["n04259630", "sombrero"], "809": ["n04263257", "soup_bowl"], "810": ["n04264628", "space_bar"], "811": ["n04265275", "space_heater"], "812": ["n04266014", "space_shuttle"], "813": ["n04270147", "spatula"], "814": ["n04273569", "speedboat"], "815": ["n04275548", "spider_web"], "816": ["n04277352", "spindle"], "817": ["n04285008", "sports_car"], "818": ["n04286575", "spotlight"], "819": ["n04296562", "stage"], "820": ["n04310018", "steam_locomotive"], "821": ["n04311004", "steel_arch_bridge"], "822": ["n04311174", "steel_drum"], "823": ["n04317175", "stethoscope"], "824": ["n04325704", "stole"], "825": ["n04326547", "stone_wall"], "826": ["n04328186", "stopwatch"], "827": ["n04330267", "stove"], "828": ["n04332243", "strainer"], "829": ["n04335435", "streetcar"], "830": ["n04336792", "stretcher"], "831": ["n04344873", "studio_couch"], "832": ["n04346328", "stupa"], "833": ["n04347754", "submarine"], "834": ["n04350905", "suit"], "835": ["n04355338", "sundial"], "836": ["n04355933", "sunglass"], "837": ["n04356056", "sunglasses"], "838": ["n04357314", "sunscreen"], "839": ["n04366367", "suspension_bridge"], "840": ["n04367480", "swab"], "841": ["n04370456", "sweatshirt"], "842": ["n04371430", "swimming_trunks"], "843": ["n04371774", "swing"], "844": ["n04372370", "switch"], "845": ["n04376876", "syringe"], "846": ["n04380533", "table_lamp"], "847": ["n04389033", "tank"], "848": ["n04392985", "tape_player"], "849": ["n04398044", "teapot"], "850": ["n04399382", "teddy"], "851": ["n04404412", "television"], "852": ["n04409515", "tennis_ball"], "853": ["n04417672", "thatch"], "854": ["n04418357", "theater_curtain"], "855": ["n04423845", "thimble"], "856": ["n04428191", "thresher"], "857": ["n04429376", "throne"], "858": ["n04435653", "tile_roof"], "859": ["n04442312", "toaster"], "860": ["n04443257", "tobacco_shop"], "861": ["n04447861", "toilet_seat"], "862": ["n04456115", "torch"], "863": ["n04458633", "totem_pole"], "864": ["n04461696", "tow_truck"], "865": ["n04462240", "toyshop"], "866": ["n04465501", "tractor"], "867": ["n04467665", 
"trailer_truck"], "868": ["n04476259", "tray"], "869": ["n04479046", "trench_coat"], "870": ["n04482393", "tricycle"], "871": ["n04483307", "trimaran"], "872": ["n04485082", "tripod"], "873": ["n04486054", "triumphal_arch"], "874": ["n04487081", "trolleybus"], "875": ["n04487394", "trombone"], "876": ["n04493381", "tub"], "877": ["n04501370", "turnstile"], "878": ["n04505470", "typewriter_keyboard"], "879": ["n04507155", "umbrella"], "880": ["n04509417", "unicycle"], "881": ["n04515003", "upright"], "882": ["n04517823", "vacuum"], "883": ["n04522168", "vase"], "884": ["n04523525", "vault"], "885": ["n04525038", "velvet"], "886": ["n04525305", "vending_machine"], "887": ["n04532106", "vestment"], "888": ["n04532670", "viaduct"], "889": ["n04536866", "violin"], "890": ["n04540053", "volleyball"], "891": ["n04542943", "waffle_iron"], "892": ["n04548280", "wall_clock"], "893": ["n04548362", "wallet"], "894": ["n04550184", "wardrobe"], "895": ["n04552348", "warplane"], "896": ["n04553703", "washbasin"], "897": ["n04554684", "washer"], "898": ["n04557648", "water_bottle"], "899": ["n04560804", "water_jug"], "900": ["n04562935", "water_tower"], "901": ["n04579145", "whiskey_jug"], "902": ["n04579432", "whistle"], "903": ["n04584207", "wig"], "904": ["n04589890", "window_screen"], "905": ["n04590129", "window_shade"], "906": ["n04591157", "Windsor_tie"], "907": ["n04591713", "wine_bottle"], "908": ["n04592741", "wing"], "909": ["n04596742", "wok"], "910": ["n04597913", "wooden_spoon"], "911": ["n04599235", "wool"], "912": ["n04604644", "worm_fence"], "913": ["n04606251", "wreck"], "914": ["n04612504", "yawl"], "915": ["n04613696", "yurt"], "916": ["n06359193", "web_site"], "917": ["n06596364", "comic_book"], "918": ["n06785654", "crossword_puzzle"], "919": ["n06794110", "street_sign"], "920": ["n06874185", "traffic_light"], "921": ["n07248320", "book_jacket"], "922": ["n07565083", "menu"], "923": ["n07579787", "plate"], "924": ["n07583066", "guacamole"], "925": ["n07584110", "consomme"], "926": ["n07590611", "hot_pot"], "927": ["n07613480", "trifle"], "928": ["n07614500", "ice_cream"], "929": ["n07615774", "ice_lolly"], "930": ["n07684084", "French_loaf"], "931": ["n07693725", "bagel"], "932": ["n07695742", "pretzel"], "933": ["n07697313", "cheeseburger"], "934": ["n07697537", "hotdog"], "935": ["n07711569", "mashed_potato"], "936": ["n07714571", "head_cabbage"], "937": ["n07714990", "broccoli"], "938": ["n07715103", "cauliflower"], "939": ["n07716358", "zucchini"], "940": ["n07716906", "spaghetti_squash"], "941": ["n07717410", "acorn_squash"], "942": ["n07717556", "butternut_squash"], "943": ["n07718472", "cucumber"], "944": ["n07718747", "artichoke"], "945": ["n07720875", "bell_pepper"], "946": ["n07730033", "cardoon"], "947": ["n07734744", "mushroom"], "948": ["n07742313", "Granny_Smith"], "949": ["n07745940", "strawberry"], "950": ["n07747607", "orange"], "951": ["n07749582", "lemon"], "952": ["n07753113", "fig"], "953": ["n07753275", "pineapple"], "954": ["n07753592", "banana"], "955": ["n07754684", "jackfruit"], "956": ["n07760859", "custard_apple"], "957": ["n07768694", "pomegranate"], "958": ["n07802026", "hay"], "959": ["n07831146", "carbonara"], "960": ["n07836838", "chocolate_sauce"], "961": ["n07860988", "dough"], "962": ["n07871810", "meat_loaf"], "963": ["n07873807", "pizza"], "964": ["n07875152", "potpie"], "965": ["n07880968", "burrito"], "966": ["n07892512", "red_wine"], "967": ["n07920052", "espresso"], "968": ["n07930864", "cup"], "969": ["n07932039", "eggnog"], "970": 
["n09193705", "alp"], "971": ["n09229709", "bubble"], "972": ["n09246464", "cliff"], "973": ["n09256479", "coral_reef"], "974": ["n09288635", "geyser"], "975": ["n09332890", "lakeside"], "976": ["n09399592", "promontory"], "977": ["n09421951", "sandbar"], "978": ["n09428293", "seashore"], "979": ["n09468604", "valley"], "980": ["n09472597", "volcano"], "981": ["n09835506", "ballplayer"], "982": ["n10148035", "groom"], "983": ["n10565667", "scuba_diver"], "984": ["n11879895", "rapeseed"], "985": ["n11939491", "daisy"], "986": ["n12057211", "yellow_lady's_slipper"], "987": ["n12144580", "corn"], "988": ["n12267677", "acorn"], "989": ["n12620546", "hip"], "990": ["n12768682", "buckeye"], "991": ["n12985857", "coral_fungus"], "992": ["n12998815", "agaric"], "993": ["n13037406", "gyromitra"], "994": ["n13040303", "stinkhorn"], "995": ["n13044778", "earthstar"], "996": ["n13052670", "hen-of-the-woods"], "997": ["n13054560", "bolete"], "998": ["n13133613", "ear"], "999": ["n15075141", "toilet_tissue"]} \ No newline at end of file diff --git a/examples/nvidia_dali/model.py b/examples/nvidia_dali/model.py new file mode 100644 index 0000000000..203db57552 --- /dev/null +++ b/examples/nvidia_dali/model.py @@ -0,0 +1,6 @@ +from torchvision.models.resnet import BasicBlock, ResNet + + +class ImageClassifier(ResNet): + def __init__(self): + super(ImageClassifier, self).__init__(BasicBlock, [2, 2, 2, 2]) diff --git a/examples/nvidia_dali/requirements.txt b/examples/nvidia_dali/requirements.txt new file mode 100644 index 0000000000..dd79b8db37 --- /dev/null +++ b/examples/nvidia_dali/requirements.txt @@ -0,0 +1,2 @@ +nvidia-dali-cuda110==1.18.0 +--extra-index-url https://developer.download.nvidia.com/compute/redist diff --git a/examples/nvidia_dali/serialize_dali_pipeline.py b/examples/nvidia_dali/serialize_dali_pipeline.py new file mode 100644 index 0000000000..3f01d53723 --- /dev/null +++ b/examples/nvidia_dali/serialize_dali_pipeline.py @@ -0,0 +1,55 @@ +import json +import os + +import nvidia.dali as dali +import nvidia.dali.types as types + + +def parse_args(): + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--save", default="./model.dali") + parser.add_argument("--config", default="dali_config.json") + return parser.parse_args() + + +@dali.pipeline_def +def pipe(): + jpegs = dali.fn.external_source(dtype=types.UINT8, name="my_source") + decoded = dali.fn.decoders.image(jpegs, device="mixed") + resized = dali.fn.resize( + decoded, + size=[256], + subpixel_scale=False, + interp_type=types.DALIInterpType.INTERP_LINEAR, + antialias=True, + mode="not_smaller", + ) + normalized = dali.fn.crop_mirror_normalize( + resized, + crop_pos_x=0.5, + crop_pos_y=0.5, + crop=(224, 224), + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + std=[0.229 * 255, 0.224 * 255, 0.225 * 255], + ) + return normalized + + +def main(filename): + with open(args.config) as fp: + config = json.load(fp) + batch_size = config["batch_size"] + num_threads = config["num_threads"] + device_id = config["device_id"] + + pipe1 = pipe(batch_size=batch_size, num_threads=num_threads, device_id=device_id) + pipe1.serialize(filename=filename) + print("Saved {}".format(filename)) + + +if __name__ == "__main__": + args = parse_args() + os.makedirs(os.path.dirname(args.save), exist_ok=True) + main(args.save) diff --git a/examples/pt2/README.md b/examples/pt2/README.md index 1a6be197f0..661a8362ed 100644 --- a/examples/pt2/README.md +++ b/examples/pt2/README.md @@ -1,29 +1,29 @@ ## PyTorch 2.x integration -PyTorch 2.0 
brings more compiler options to PyTorch, for you that should mean better perf either in the form of lower latency or lower memory consumption. Integrating PyTorch 2.0 is fairly trivial but for now the support will be experimental until the official release and while we are relying on the nightly builds. +PyTorch 2.0 brings more compiler options to PyTorch; for you, that should mean better perf either in the form of lower latency or lower memory consumption. Integrating PyTorch 2.0 is fairly trivial, but for now the support will be experimental given that most public benchmarks have focused on training instead of inference. We strongly recommend you leverage newer hardware so for GPUs that would be an Ampere architecture. You'll get even more benefits from using server GPU deployments like A10G and A100 vs consumer cards. But you should expect to see some speedups for any Volta or Ampere architecture. ## Get started -Install torchserve with nightly torch binaries +Install torchserve and ensure that you're using at least `torch>=2.0.0` -``` -python ts_scripts/install_dependencies.py --cuda=cu117 --nightly_torch +```sh +python ts_scripts/install_dependencies.py --cuda=cu117 pip install torchserve torch-model-archiver ``` ## Package your model -PyTorch 2.0 supports several compiler backends and you pick which one you want by passing in an optional file `compile.json` during your model packaging +PyTorch 2.0 supports several compiler backends and you pick the one you want by passing in an optional file `model_config.yaml` during your model packaging -`{"pt2" : "inductor"}` +`pt2: "inductor"` -As an example let's expand our getting started guide with the only difference being passing in the extra `compile.json` file +As an example, let's expand our getting started guide with the only difference being passing in the extra `model_config.yaml` file ``` mkdir model_store -torch-model-archiver --model-name densenet161 --version 1.0 --model-file ./serve/examples/image_classifier/densenet_161/model.py --export-path model_store --extra-files ./serve/examples/image_classifier/index_to_name.json,./serve/examples/image_classifier/compile.json --handler image_classifier +torch-model-archiver --model-name densenet161 --version 1.0 --model-file ./serve/examples/image_classifier/densenet_161/model.py --export-path model_store --extra-files ./serve/examples/image_classifier/index_to_name.json --handler image_classifier --config-file model_config.yaml torchserve --start --ncs --model-store model_store --models densenet161.mar ``` @@ -35,7 +35,7 @@ opt_mod = torch.compile(mod) # 2. Train the optimized module # .... # 3. Save the original module (weights are shared) -torch.save(model, "model.pt") +torch.save(model, "model.pt") # 4. Load the non optimized model mod = torch.load(model) diff --git a/examples/text_classification_with_scriptable_tokenizer/handler.py b/examples/text_classification_with_scriptable_tokenizer/handler.py index d70c67f4be..8081f59ba9 100644 --- a/examples/text_classification_with_scriptable_tokenizer/handler.py +++ b/examples/text_classification_with_scriptable_tokenizer/handler.py @@ -1,6 +1,5 @@ """ Module for text classification with scriptable tokenizer -DOES NOT SUPPORT BATCH! """ import logging from abc import ABC @@ -51,18 +50,19 @@ def preprocess(self, data): # Compat layer: normally the envelope should just return the data # directly, but older versions of Torchserve didn't have envelope.
- # Processing only the first input, not handling batch inference - line = data[0] - text = line.get("data") or line.get("body") - # Decode text if not a str but bytes or bytearray - if isinstance(text, (bytes, bytearray)): - text = text.decode("utf-8") + text_batch = [] + for line in data: + text = line.get("data") or line.get("body") + # Decode text if not a str but bytes or bytearray + if isinstance(text, (bytes, bytearray)): + text = text.decode("utf-8") - text = remove_html_tags(text) - text = text.lower() + text = remove_html_tags(text) + text = text.lower() + text_batch.append(text) - return text + return text_batch def inference(self, data, *args, **kwargs): """The Inference Request is made through this function and the user diff --git a/examples/text_classification_with_scriptable_tokenizer/script_tokenizer_and_model.py b/examples/text_classification_with_scriptable_tokenizer/script_tokenizer_and_model.py index a51297caca..873c0264ee 100644 --- a/examples/text_classification_with_scriptable_tokenizer/script_tokenizer_and_model.py +++ b/examples/text_classification_with_scriptable_tokenizer/script_tokenizer_and_model.py @@ -76,7 +76,7 @@ def main(args): model = XLMR_BASE_ENCODER.get_model(head=classifier_head) # Load trained parameters and load them into the model - model.load_state_dict(torch.load(args.input_file)) + model.load_state_dict(torch.load(args.input_file, map_location=torch.device("cpu"))) # Chain the tokenizer, the adapter and the model combi_model = T.Sequential( @@ -88,7 +88,7 @@ def main(args): combi_model.eval() # Make sure to move the model to CPU to avoid placement error during loading - combi_model.to("cpu") + combi_model.to(torch.device("cpu")) combi_model_jit = torch.jit.script(combi_model) diff --git a/frontend/archive/build.gradle b/frontend/archive/build.gradle index cce015aa26..412b276bbf 100644 --- a/frontend/archive/build.gradle +++ b/frontend/archive/build.gradle @@ -1,8 +1,10 @@ dependencies { api "commons-io:commons-io:2.6" api "org.slf4j:slf4j-api:${slf4j_api_version}" api "org.apache.logging.log4j:log4j-slf4j-impl:${slf4j_log4j_version}" api "com.google.code.gson:gson:${gson_version}" + implementation "org.yaml:snakeyaml:${snakeyaml_version}" + implementation 'org.apache.commons:commons-compress:1.23.0' testImplementation "commons-cli:commons-cli:${commons_cli_version}" testImplementation "org.testng:testng:${testng_version}" diff --git a/frontend/archive/src/main/java/org/pytorch/serve/archive/model/Manifest.java b/frontend/archive/src/main/java/org/pytorch/serve/archive/model/Manifest.java index 18c44a2d9c..9764dd78d3 100644 --- a/frontend/archive/src/main/java/org/pytorch/serve/archive/model/Manifest.java +++ b/frontend/archive/src/main/java/org/pytorch/serve/archive/model/Manifest.java @@ -64,6 +64,7 @@ public static final class Model { private String handler; private String envelope; private String requirementsFile; + private String configFile; public Model() {} @@ -122,6 +123,14 @@ public String getEnvelope() { public void setEnvelope(String envelope) { this.envelope = envelope; } + + public String getConfigFile() { + return configFile; + } + + public void setConfigFile(String configFile) { + this.configFile = configFile; + } } public enum RuntimeType { diff --git a/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelArchive.java b/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelArchive.java index 134a931f81..47b81458ae 100644 ---
a/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelArchive.java +++ b/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelArchive.java @@ -6,6 +6,7 @@ import java.nio.file.FileAlreadyExistsException; import java.nio.file.Files; import java.util.List; +import java.util.Map; import org.apache.commons.io.FileUtils; import org.apache.commons.io.FilenameUtils; import org.pytorch.serve.archive.DownloadArchiveException; @@ -25,12 +26,14 @@ public class ModelArchive { private String url; private File modelDir; private boolean extracted; + private ModelConfig modelConfig; public ModelArchive(Manifest manifest, String url, File modelDir, boolean extracted) { this.manifest = manifest; this.url = url; this.modelDir = modelDir; this.extracted = extracted; + this.modelConfig = null; } public static ModelArchive downloadModel( @@ -67,7 +70,12 @@ public static ModelArchive downloadModel( if (modelLocation.isFile()) { try (InputStream is = Files.newInputStream(modelLocation.toPath())) { - File unzipDir = ZipUtils.unzip(is, null, "models"); + File unzipDir; + if (modelLocation.getName().endsWith(".mar")) { + unzipDir = ZipUtils.unzip(is, null, "models", true); + } else { + unzipDir = ZipUtils.unzip(is, null, "models", false); + } return load(url, unzipDir, true); } } @@ -92,7 +100,7 @@ private static ModelArchive load(String url, File dir, boolean extracted) boolean failed = true; try { File manifestFile = new File(dir, "MAR-INF/" + MANIFEST_FILE); - Manifest manifest = null; + Manifest manifest; if (manifestFile.exists()) { manifest = ArchiveUtils.readFile(manifestFile, Manifest.class); } else { @@ -179,4 +187,21 @@ public void clean() { FileUtils.deleteQuietly(modelDir); } } + + public ModelConfig getModelConfig() { + if (this.modelConfig == null && manifest.getModel().getConfigFile() != null) { + try { + File configFile = + new File(modelDir.getAbsolutePath(), manifest.getModel().getConfigFile()); + Map modelConfigMap = ArchiveUtils.readYamlFile(configFile); + this.modelConfig = ModelConfig.build(modelConfigMap); + } catch (InvalidModelException | IOException e) { + logger.error( + "Failed to parse model config file {}", + manifest.getModel().getConfigFile(), + e); + } + } + return this.modelConfig; + } } diff --git a/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java b/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java new file mode 100644 index 0000000000..dc72b6b085 --- /dev/null +++ b/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java @@ -0,0 +1,569 @@ +package org.pytorch.serve.archive.model; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ModelConfig { + private static final Logger logger = LoggerFactory.getLogger(ModelConfig.class); + + /** the minimum number of workers of a model */ + private int minWorkers; + /** the maximum number of workers of a model */ + private int maxWorkers; + /** the batch size of a model */ + private int batchSize; + /** the maximum delay in msec of a batch of a model */ + private int maxBatchDelay; + /** the timeout in sec of a specific model's response. */ + private int responseTimeout = 120; // unit: sec + /** + * the device type where the model is loaded. It can be gpu, cpu. The model is loaded on CPU if + * deviceType: "cpu" is set on a GPU host. 
+ */ + private DeviceType deviceType = DeviceType.NONE; + /** + * the user-specified GPU device ids. By default, TorchServe round-robins all available GPUs + * to assign deviceIds to a worker of a model if deviceIds is not set. + */ + private List<Integer> deviceIds; + /** this variable is auto-calculated based on torchrun nproc-per-node. */ + private int parallelLevel = 1; + /** the model parallel type: can be tp, pp, or pptp */ + private ParallelType parallelType = ParallelType.NONE; + /** torchrun config */ + private TorchRun torchRun; + /** the maximum timeout in seconds for a worker recovery. default: 5 min */ + private int maxRetryTimeoutInSec = 300; + /** + * the client timeout in milliseconds. The inference request will be dropped once it times + * out. default: 0, which means no timeout (i.e. clientExpireTS defaults to Long.MAX_VALUE). + */ + private long clientTimeoutInMills; + /** + * the job queue size of a model. By default, job_queue_size is set as 100 in config.property + * for all models. Here, jobQueueSize: -1 means no customized setting for the model. + */ + private int jobQueueSize; + /** + * useJobTicket is a flag which allows an inference request to be accepted only if there are + * available workers. + */ + private boolean useJobTicket; +
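For reference, the fields above map one-to-one onto keys in the per-model YAML. A minimal sketch of a config that exercises them, with illustrative values (PyYAML stands in here for the frontend's SnakeYAML):

```python
import yaml  # PyYAML; the frontend parses the same document with SnakeYAML

example = """
minWorkers: 2
maxWorkers: 4
batchSize: 8
maxBatchDelay: 100
responseTimeout: 120
deviceType: gpu
deviceIds: [0, 1]
parallelType: pp
maxRetryTimeoutInSec: 300
clientTimeoutInMills: 5000
jobQueueSize: 200
useJobTicket: false
torchrun:
  nproc-per-node: 4
"""

config = yaml.safe_load(example)
assert config["deviceIds"] == [0, 1]              # parsed as a list of ints
assert config["torchrun"]["nproc-per-node"] == 4  # nested torchrun parameters
```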
+ public static ModelConfig build(Map<String, Object> yamlMap) { + ModelConfig modelConfig = new ModelConfig(); + yamlMap.forEach( + (k, v) -> { + switch (k) { + case "minWorkers": + if (v instanceof Integer) { + modelConfig.setMinWorkers((int) v); + } else { + logger.warn("Invalid minWorkers: {}, should be integer", v); + } + break; + case "maxWorkers": + if (v instanceof Integer) { + modelConfig.setMaxWorkers((int) v); + } else { + logger.warn("Invalid maxWorkers: {}, should be integer", v); + } + break; + case "batchSize": + if (v instanceof Integer) { + modelConfig.setBatchSize((int) v); + } else { + logger.warn("Invalid batchSize: {}, should be integer", v); + } + break; + case "maxBatchDelay": + if (v instanceof Integer) { + modelConfig.setMaxBatchDelay((int) v); + } else { + logger.warn("Invalid maxBatchDelay: {}, should be integer", v); + } + break; + case "responseTimeout": + if (v instanceof Integer) { + modelConfig.setResponseTimeout((int) v); + } else { + logger.warn("Invalid responseTimeout: {}, should be integer", v); + } + break; + case "deviceType": + if (v instanceof String) { + modelConfig.setDeviceType((String) v); + } else { + logger.warn("Invalid deviceType: {}, should be cpu or gpu", v); + } + break; + case "parallelType": + if (v instanceof String) { + modelConfig.setParallelMode((String) v); + } else { + logger.warn( + "Invalid parallelType: {}, should be pp, tp, or pptp", v); + } + break; + case "deviceIds": + if (v instanceof List) { + modelConfig.setDeviceIds((List<?>) v); + } else { + logger.warn("Invalid deviceIds: {}, should be list of integer", v); + } + break; + case "torchrun": + if (v instanceof Map) { + modelConfig.torchRun = TorchRun.build((Map<?, ?>) v); + modelConfig.setParallelLevel( + modelConfig.torchRun.getNprocPerNode()); + } else { + logger.warn( + "Invalid torchrun: {}, should be Torchrun parameters", v); + } + break; + case "maxRetryTimeoutInSec": + if (v instanceof Integer) { + modelConfig.setMaxRetryTimeoutInSec((int) v); + } else { + logger.warn( + "Invalid maxRetryTimeoutInSec: {}, should be integer", v); + } + break; + case "clientTimeoutInMills": + if (v instanceof Integer) { + modelConfig.setClientTimeoutInMills(((Integer) v).longValue()); + } else { + logger.warn( + "Invalid clientTimeoutInMills: {}, should be positive long", + v); + } + break; + case "jobQueueSize": + if (v instanceof Integer) { + modelConfig.setJobQueueSize((int) v); + } else { + logger.warn("Invalid jobQueueSize: {}, should be positive int", v); + } + break; + case "useJobTicket": + if (v instanceof Boolean) { + modelConfig.setUseJobTicket((boolean) v); + } else { + logger.warn("Invalid useJobTicket: {}, should be true or false", v); + } + break; + default: + break; + } + }); + return modelConfig; + } + + public int getMinWorkers() { + return minWorkers; + } + + public void setMinWorkers(int minWorkers) { + if (minWorkers < 0) { + logger.warn("Invalid minWorkers:{}", minWorkers); + return; + } + this.minWorkers = minWorkers; + } + + public int getMaxWorkers() { + return maxWorkers; + } + + public void setMaxWorkers(int maxWorkers) { + if (maxWorkers < 0) { + logger.warn("Invalid maxWorkers:{}", maxWorkers); + return; + } + this.maxWorkers = maxWorkers; + } + + public int getBatchSize() { + return batchSize; + } + + public void setBatchSize(int batchSize) { + if (batchSize <= 0) { + logger.warn("Invalid batchSize:{}", batchSize); + return; + } + this.batchSize = batchSize; + } + + public int getMaxBatchDelay() { + return maxBatchDelay; + } + + public void setMaxBatchDelay(int maxBatchDelay) { + if (maxBatchDelay < 0) { + logger.warn("Invalid maxBatchDelay:{}", maxBatchDelay); + return; + } + this.maxBatchDelay = maxBatchDelay; + } + + public int getResponseTimeout() { + return responseTimeout; + } + + public void setResponseTimeout(int responseTimeout) { + if (responseTimeout <= 0) { + logger.warn("Invalid responseTimeout:{}", responseTimeout); + return; + } + this.responseTimeout = responseTimeout; + } + + public List<Integer> getDeviceIds() { + return deviceIds; + } + + public void setDeviceIds(List<?> deviceIds) { + this.deviceIds = new ArrayList<>(); + for (int i = 0; i < deviceIds.size(); i++) { + if (deviceIds.get(i) instanceof Integer) { + this.deviceIds.add((int) deviceIds.get(i)); + } else { + logger.warn("Invalid deviceIds:{},", deviceIds.get(i)); + this.deviceIds = null; + break; + } + } + } + + public int getParallelLevel() { + return parallelLevel; + } + + public void setParallelLevel(int parallelLevel) { + if (parallelLevel <= 0) { + logger.warn("Invalid parallelLevel:{}, set as 1", parallelLevel); + this.parallelLevel = 1; + return; + } + this.parallelLevel = parallelLevel; + } + + public void setParallelMode(String parallelMode) { + this.parallelType = ParallelType.get(parallelMode); + } + + public ParallelType getParallelType() { + return this.parallelType; + } + + public void setDeviceType(String deviceType) { + this.deviceType = DeviceType.get(deviceType); + } + + public DeviceType getDeviceType() { + return deviceType; + } + + public TorchRun getTorchRun() { + return torchRun; + } + + public int getMaxRetryTimeoutInSec() { + return maxRetryTimeoutInSec; + } + + public void setMaxRetryTimeoutInSec(int maxRetryTimeoutInSec) { + if (maxRetryTimeoutInSec > 0) { + this.maxRetryTimeoutInSec = maxRetryTimeoutInSec; + } + } + + public long getClientTimeoutInMills() { + return clientTimeoutInMills; + } + + public void setClientTimeoutInMills(long clientTimeoutInMills) { + if (clientTimeoutInMills > 0) { + this.clientTimeoutInMills = clientTimeoutInMills; + } + } + + public int getJobQueueSize() { + return jobQueueSize; + } + + public void setJobQueueSize(int jobQueueSize) { + if (jobQueueSize > 0) { + this.jobQueueSize = jobQueueSize; + } + } + + public boolean isUseJobTicket() { + return useJobTicket; + } + + public void
setUseJobTicket(boolean useJobTicket) { + this.useJobTicket = useJobTicket; + } + + public enum ParallelType { + NONE(""), + PP("pp"), + TP("tp"), + PPTP("pptp"); + + private String type; + + ParallelType(String type) { + this.type = type.toLowerCase(); + } + + public String getParallelType() { + return type; + } + + public static ParallelType get(String parallelType) { + ParallelType pType = NONE; + try { + pType = + Arrays.stream(ParallelType.values()) + .filter(t -> t.type.equals(parallelType.toLowerCase())) + .findFirst() + .get(); + } catch (NoSuchElementException e) { + logger.warn("Invalid ParallelType:{}", parallelType, e); + } + return pType; + } + } + + public enum DeviceType { + NONE(""), + CPU("cpu"), + GPU("gpu"); + + private String type; + + DeviceType(String type) { + this.type = type.toLowerCase(); + } + + public String getDeviceType() { + return type; + } + + public static DeviceType get(String deviceType) { + DeviceType dType = DeviceType.NONE; + try { + dType = + Arrays.stream(DeviceType.values()) + .filter(t -> t.type.equals(deviceType.toLowerCase())) + .findFirst() + .get(); + } catch (NoSuchElementException e) { + logger.warn("Invalid DeviceType:{}", deviceType, e); + } + return dType; + } + } + + public static class TorchRun { + private int nnodes = 1; + private int nprocPerNode = 1; + private String rdzvId; + private String rdzvEndpoint; + private String rdzvBackend = "c10d"; + private String rdzvConf; + private int monitorInterval = 5; + private int nodeRank; + private String masterAddr; + private int masterPort; + private int ompNumberThreads = 1; + + public static TorchRun build(Map<?, ?> torchRunMap) { + TorchRun torchRun = new TorchRun(); + torchRunMap.forEach( + (k, v) -> { + switch ((String) k) { + case "nnodes": + if (v instanceof Integer) { + torchRun.setNnodes((Integer) v); + } else { + logger.warn("Invalid torchrun.nnodes:{}, reset to 1", v); + } + break; + case "nproc-per-node": + if (v instanceof Integer) { + torchRun.setNprocPerNode((Integer) v); + } else { + logger.warn( + "Invalid torchrun.nproc-per-node:{}, reset to 1", v); + } + break; + case "rdzv-backend": + if (v instanceof String) { + torchRun.setRdzvBackend((String) v); + } else { + logger.warn( + "Invalid torchrun.rdzv-backend:{}, reset to c10d", v); + } + break; + case "rdzv-endpoint": + if (v instanceof String) { + torchRun.setRdzvEndpoint((String) v); + } else { + logger.warn("Invalid torchrun.rdzv-endpoint:{}", v); + } + break; + case "rdzv-conf": + if (v instanceof String) { + torchRun.setRdzvConf((String) v); + } else { + logger.warn("Invalid torchrun.rdzv-conf:{}", v); + } + break; + case "monitor-interval": + if (v instanceof Integer) { + torchRun.setMonitorInterval((Integer) v); + } else { + logger.warn("Invalid torchrun.monitor-interval:{}, reset to 5", v); + } + break; + case "node-rank": + if (v instanceof Integer) { + torchRun.setNodeRank((Integer) v); + } else { + logger.warn("Invalid torchrun.node-rank:{}, reset to 0", v); + } + break; + case "OMP_NUMBER_THREADS": + if (v instanceof Integer) { + torchRun.setOmpNumberThreads((Integer) v); + } else { + logger.warn("Invalid OMP_NUMBER_THREADS:{}, reset to 1", v); + } + break; + default: + logger.warn("unsupported parameter {}", k); + break; + } + }); + return torchRun; + } + + public int getNnodes() { + return nnodes; + } + + public void setNnodes(int nnodes) { + if (nnodes <= 0) { + logger.warn("Invalid torchrun.nnodes:{}, reset to 1", nnodes); + return; + } + this.nnodes = nnodes; + } + + public int getNprocPerNode() { + return
nprocPerNode; + } + + public void setNprocPerNode(int nprocPerNode) { + if (nprocPerNode <= 0) { + logger.warn("Invalid torchrun.nproc-per-node:{}, reset to 1", nprocPerNode); + return; + } + this.nprocPerNode = nprocPerNode; + } + + public String getRdzvId() { + return rdzvId; + } + + public void setRdzvId(String rdzvId) { + this.rdzvId = rdzvId; + } + + public String getRdzvEndpoint() { + return rdzvEndpoint; + } + + public void setRdzvEndpoint(String rdzvEndpoint) { + this.rdzvEndpoint = rdzvEndpoint; + } + + public String getRdzvBackend() { + return rdzvBackend; + } + + public void setRdzvBackend(String rdzvBackend) { + this.rdzvBackend = rdzvBackend; + } + + public String getRdzvConf() { + return rdzvConf; + } + + public void setRdzvConf(String rdzvConf) { + this.rdzvConf = rdzvConf; + } + + public int getMonitorInterval() { + return monitorInterval; + } + + public void setMonitorInterval(int monitorInterval) { + if (monitorInterval <= 0) { + logger.warn("Invalid torchrun.monitor-interval:{}, reset to 5", monitorInterval); + return; + } + this.monitorInterval = monitorInterval; + } + + public int getNodeRank() { + return nodeRank; + } + + public void setNodeRank(int nodeRank) { + if (nodeRank < 0) { + logger.warn("Invalid torchrun.node-rank:{}, reset to 0", nodeRank); + return; + } + this.nodeRank = nodeRank; + } + + public String getMasterAddr() { + return masterAddr; + } + + public void setMasterAddr(String masterAddr) { + this.masterAddr = masterAddr; + } + + public int getMasterPort() { + return masterPort; + } + + public void setMasterPort(int masterPort) { + this.masterPort = masterPort; + } + + public int getOmpNumberThreads() { + return ompNumberThreads; + } + + public void setOmpNumberThreads(int ompNumberThreads) { + if (ompNumberThreads < 1) { + logger.warn("Invalid OMP_NUMBER_THREADS:{}, reset to 1", ompNumberThreads); + return; + } + this.ompNumberThreads = ompNumberThreads; + } + } +} diff --git a/frontend/archive/src/main/java/org/pytorch/serve/archive/model/s3/BinaryUtils.java b/frontend/archive/src/main/java/org/pytorch/serve/archive/model/s3/BinaryUtils.java index a41a58dee0..a0c941b5fe 100644 --- a/frontend/archive/src/main/java/org/pytorch/serve/archive/model/s3/BinaryUtils.java +++ b/frontend/archive/src/main/java/org/pytorch/serve/archive/model/s3/BinaryUtils.java @@ -36,7 +36,7 @@ public static String toHex(byte[] data) { */ public static byte[] fromHex(String hexData) { byte[] result = new byte[(hexData.length() + 1) / 2]; - String hexNumber = null; + String hexNumber; int stringOffset = 0; int byteOffset = 0; while (stringOffset < hexData.length()) { diff --git a/frontend/archive/src/main/java/org/pytorch/serve/archive/model/s3/HttpUtils.java b/frontend/archive/src/main/java/org/pytorch/serve/archive/model/s3/HttpUtils.java index 5a469d2513..8c03f78875 100644 --- a/frontend/archive/src/main/java/org/pytorch/serve/archive/model/s3/HttpUtils.java +++ b/frontend/archive/src/main/java/org/pytorch/serve/archive/model/s3/HttpUtils.java @@ -22,7 +22,7 @@ private HttpUtils() {} public static void copyURLToFile(URL endpointUrl, File modelLocation, boolean s3SseKmsEnabled) throws IOException { // for a simple GET, we have no body so supply the precomputed 'empty' hash - Map headers = null; + Map headers; if (s3SseKmsEnabled) { String awsAccessKey = System.getenv("AWS_ACCESS_KEY_ID"); String awsSecretKey = System.getenv("AWS_SECRET_ACCESS_KEY"); diff --git a/frontend/archive/src/main/java/org/pytorch/serve/archive/utils/ArchiveUtils.java 
b/frontend/archive/src/main/java/org/pytorch/serve/archive/utils/ArchiveUtils.java index f752337608..82c4681dd6 100644 --- a/frontend/archive/src/main/java/org/pytorch/serve/archive/utils/ArchiveUtils.java +++ b/frontend/archive/src/main/java/org/pytorch/serve/archive/utils/ArchiveUtils.java @@ -13,11 +13,15 @@ import java.nio.file.FileAlreadyExistsException; import java.nio.file.Files; import java.util.List; +import java.util.Map; import java.util.regex.Pattern; import org.apache.commons.io.FileUtils; import org.pytorch.serve.archive.DownloadArchiveException; import org.pytorch.serve.archive.model.InvalidModelException; import org.pytorch.serve.archive.s3.HttpUtils; +import org.yaml.snakeyaml.Yaml; +import org.yaml.snakeyaml.constructor.Constructor; +import org.yaml.snakeyaml.error.YAMLException; public final class ArchiveUtils { @@ -39,6 +43,32 @@ public static <T> T readFile(File file, Class<T> type) } } + public static <T> T readYamlFile(File file, Class<T> type) + throws InvalidModelException, IOException { + Yaml yaml = new Yaml(new Constructor(type)); + try (Reader r = + new InputStreamReader( + Files.newInputStream(file.toPath()), StandardCharsets.UTF_8)) { + + return yaml.load(r); + } catch (YAMLException e) { + throw new InvalidModelException("Failed to parse model config yaml file.", e); + } + } + + public static Map<String, Object> readYamlFile(File file) + throws InvalidModelException, IOException { + Yaml yaml = new Yaml(); + try (Reader r = + new InputStreamReader( + Files.newInputStream(file.toPath()), StandardCharsets.UTF_8)) { + + return yaml.load(r); + } catch (YAMLException e) { + throw new InvalidModelException("Failed to parse model config yaml file.", e); + } + } + public static boolean validateURL(List<String> allowedUrls, String url) throws InvalidArchiveURLException { boolean patternMatch = false; diff --git a/frontend/archive/src/main/java/org/pytorch/serve/archive/utils/ZipUtils.java b/frontend/archive/src/main/java/org/pytorch/serve/archive/utils/ZipUtils.java index bc1a3bf77c..68014fc98d 100644 --- a/frontend/archive/src/main/java/org/pytorch/serve/archive/utils/ZipUtils.java +++ b/frontend/archive/src/main/java/org/pytorch/serve/archive/utils/ZipUtils.java @@ -14,6 +14,9 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; import java.util.zip.ZipOutputStream; +import org.apache.commons.compress.archivers.ArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; @@ -66,7 +69,8 @@ public static void addToZip(int prefix, File file, FileFilter filter, ZipOutputStream zos) } } - public static File unzip(InputStream is, String eTag, String type) throws IOException { + public static File unzip(InputStream is, String eTag, String type, boolean isMar) + throws IOException { File tmpDir = FileUtils.getTempDirectory(); File modelDir = new File(tmpDir, type); FileUtils.forceMkdir(modelDir); @@ -81,7 +85,11 @@ public static File unzip(InputStream is, String eTag, String type) throws IOException } catch (NoSuchAlgorithmException e) { throw new AssertionError(e); } - unzip(new DigestInputStream(is, md), tmp); + if (isMar) { + unzip(new DigestInputStream(is, md), tmp); + } else { + decompressTarGzipFile(new DigestInputStream(is, md), tmp); + } if (eTag == null) { eTag = UUID.randomUUID().toString().replaceAll("-", ""); } @@ -92,4 +100,24 @@ public static File unzip(InputStream is, String eTag, String type) throws IOException
return dir; } + + public static void decompressTarGzipFile(InputStream is, File dest) throws IOException { + try (GzipCompressorInputStream gzi = new GzipCompressorInputStream(is); + TarArchiveInputStream tis = new TarArchiveInputStream(gzi)) { + ArchiveEntry entry; + while ((entry = tis.getNextEntry()) != null) { + String name = entry.getName().substring(entry.getName().indexOf('/') + 1); + File file = new File(dest, name); + if (entry.isDirectory()) { + FileUtils.forceMkdir(file); + } else { + File parentFile = file.getParentFile(); + FileUtils.forceMkdir(parentFile); + try (OutputStream os = Files.newOutputStream(file.toPath())) { + IOUtils.copy(tis, os); + } + } + } + } + } } diff --git a/frontend/archive/src/main/java/org/pytorch/serve/archive/workflow/WorkflowArchive.java b/frontend/archive/src/main/java/org/pytorch/serve/archive/workflow/WorkflowArchive.java index fca6879645..aaf185528e 100644 --- a/frontend/archive/src/main/java/org/pytorch/serve/archive/workflow/WorkflowArchive.java +++ b/frontend/archive/src/main/java/org/pytorch/serve/archive/workflow/WorkflowArchive.java @@ -73,7 +73,12 @@ public static WorkflowArchive downloadWorkflow( if (workflowLocation.isFile()) { try (InputStream is = Files.newInputStream(workflowLocation.toPath())) { - File unzipDir = ZipUtils.unzip(is, null, "workflows"); + File unzipDir; + if (workflowLocation.getName().endsWith(".war")) { + unzipDir = ZipUtils.unzip(is, null, "workflows", true); + } else { + unzipDir = ZipUtils.unzip(is, null, "workflows", false); + } return load(url, unzipDir, true); } } @@ -86,7 +91,7 @@ private static WorkflowArchive load(String url, File dir, boolean extracted) boolean failed = true; try { File manifestFile = new File(dir, "WAR-INF/" + MANIFEST_FILE); - Manifest manifest = null; + Manifest manifest; if (manifestFile.exists()) { manifest = readFile(manifestFile, Manifest.class); } else { diff --git a/frontend/archive/src/test/java/org/pytorch/serve/archive/model/ModelArchiveTest.java b/frontend/archive/src/test/java/org/pytorch/serve/archive/model/ModelArchiveTest.java index efa8a5dfad..4770fff384 100644 --- a/frontend/archive/src/test/java/org/pytorch/serve/archive/model/ModelArchiveTest.java +++ b/frontend/archive/src/test/java/org/pytorch/serve/archive/model/ModelArchiveTest.java @@ -207,4 +207,18 @@ public void testMalformLocalURL() ModelArchive.downloadModel( ALLOWED_URLS_LIST, modelStore, "file:///" + modelStore + "/mnist1.mar"); } + + @Test + public void testArchiveFormatTgz() + throws ModelException, IOException, DownloadArchiveException { + String modelStore = "src/test/resources/models"; + ModelArchive archive = + ModelArchive.downloadModel(ALLOWED_URLS_LIST, modelStore, "noop.tar.gz"); + + archive.validate(); + Assert.assertTrue(new File(archive.getModelDir().getPath(), "extra1.txt").exists()); + Assert.assertTrue(new File(archive.getModelDir().getPath(), "sub1").isDirectory()); + + archive.clean(); + } } diff --git a/frontend/archive/src/test/java/org/pytorch/serve/archive/model/ModelConfigTest.java b/frontend/archive/src/test/java/org/pytorch/serve/archive/model/ModelConfigTest.java new file mode 100644 index 0000000000..bc567165a5 --- /dev/null +++ b/frontend/archive/src/test/java/org/pytorch/serve/archive/model/ModelConfigTest.java @@ -0,0 +1,50 @@ +package org.pytorch.serve.archive.model; + +import java.io.File; +import java.io.IOException; +import java.util.Map; +import org.pytorch.serve.archive.utils.ArchiveUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class 
ModelConfigTest { + @Test + public void testValidYamlConfig() throws InvalidModelException, IOException { + String yamlConfigFile = "src/test/resources/modelConfig/valid.yaml"; + ModelConfig modelConfig; + File configFile = new File(yamlConfigFile); + Map modelConfigMap = ArchiveUtils.readYamlFile(configFile); + modelConfig = ModelConfig.build(modelConfigMap); + + Assert.assertEquals(modelConfig.getMinWorkers(), 1); + Assert.assertEquals(modelConfig.getMaxWorkers(), 1); + Assert.assertEquals(modelConfig.getBatchSize(), 1); + Assert.assertEquals(modelConfig.getMaxBatchDelay(), 100); + Assert.assertEquals(modelConfig.getResponseTimeout(), 120); + Assert.assertEquals(modelConfig.getDeviceType(), ModelConfig.DeviceType.GPU); + Assert.assertEquals(modelConfig.getParallelLevel(), 4); + Assert.assertEquals(modelConfig.getParallelType(), ModelConfig.ParallelType.PP); + Assert.assertEquals(modelConfig.getDeviceIds().get(2).intValue(), 2); + Assert.assertEquals(modelConfig.getTorchRun().getNodeRank(), 0); + Assert.assertEquals(modelConfig.getTorchRun().getRdzvBackend(), "c10d"); + } + + @Test + public void testInvalidYamlConfig() throws InvalidModelException, IOException { + String yamlConfigFile = "src/test/resources/modelConfig/invalid.yaml"; + ModelConfig modelConfig; + File configFile = new File(yamlConfigFile); + Map modelConfigMap = ArchiveUtils.readYamlFile(configFile); + modelConfig = ModelConfig.build(modelConfigMap); + + Assert.assertNotEquals(modelConfig.getMinWorkers(), 1); + Assert.assertEquals(modelConfig.getMaxWorkers(), 1); + Assert.assertEquals(modelConfig.getBatchSize(), 1); + Assert.assertEquals(modelConfig.getMaxBatchDelay(), 100); + Assert.assertEquals(modelConfig.getResponseTimeout(), 120); + Assert.assertNotEquals(modelConfig.getDeviceType(), ModelConfig.DeviceType.GPU); + Assert.assertEquals(modelConfig.getParallelLevel(), 1); + Assert.assertNotEquals(modelConfig.getParallelType(), ModelConfig.ParallelType.PPTP); + Assert.assertNull(modelConfig.getDeviceIds()); + } +} diff --git a/frontend/archive/src/test/resources/modelConfig/invalid.yaml b/frontend/archive/src/test/resources/modelConfig/invalid.yaml new file mode 100644 index 0000000000..014b50bc05 --- /dev/null +++ b/frontend/archive/src/test/resources/modelConfig/invalid.yaml @@ -0,0 +1,9 @@ +# TS Frontend parameters +minWorkers: a +maxWorkers: 1 +batchSize: 1 +maxBatchDelay: 100 +responseTimeout: 120 +deviceType: xpu # cpu, gpu +deviceIds: 0,1,2,3] # device index for gpu +parallelType: "xpp" # pp: pipeline parallel; pptp: tensor+pipeline parallel \ No newline at end of file diff --git a/frontend/archive/src/test/resources/modelConfig/valid.yaml b/frontend/archive/src/test/resources/modelConfig/valid.yaml new file mode 100644 index 0000000000..45995f451a --- /dev/null +++ b/frontend/archive/src/test/resources/modelConfig/valid.yaml @@ -0,0 +1,13 @@ +# TS Frontend parameters +minWorkers: 1 +maxWorkers: 1 +batchSize: 1 +maxBatchDelay: 100 +responseTimeout: 120 +deviceType: "gpu" # cpu, gpu +deviceIds: [0,1,2,3] # device index for gpu +parallelType: "pp" # pp: pipeline parallel; pptp: tensor+pipeline parallel + +torchrun: + nproc-per-node: 4 + diff --git a/frontend/archive/src/test/resources/models/echo-client-timeout.mar b/frontend/archive/src/test/resources/models/echo-client-timeout.mar new file mode 100644 index 0000000000..a2700fac81 Binary files /dev/null and b/frontend/archive/src/test/resources/models/echo-client-timeout.mar differ diff --git a/frontend/archive/src/test/resources/models/init-error.mar 
b/frontend/archive/src/test/resources/models/init-error.mar index ad91720b58..9fd2f24cfb 100644 Binary files a/frontend/archive/src/test/resources/models/init-error.mar and b/frontend/archive/src/test/resources/models/init-error.mar differ diff --git a/frontend/archive/src/test/resources/models/mnist_scripted.mar b/frontend/archive/src/test/resources/models/mnist_scripted.mar index 3973db8ee3..80c6bde76e 100644 Binary files a/frontend/archive/src/test/resources/models/mnist_scripted.mar and b/frontend/archive/src/test/resources/models/mnist_scripted.mar differ diff --git a/frontend/archive/src/test/resources/models/noop.tar.gz b/frontend/archive/src/test/resources/models/noop.tar.gz new file mode 100644 index 0000000000..cd2cd5eeb7 Binary files /dev/null and b/frontend/archive/src/test/resources/models/noop.tar.gz differ diff --git a/frontend/archive/testng.xml b/frontend/archive/testng.xml index 16540e207d..0d050dfbcd 100644 --- a/frontend/archive/testng.xml +++ b/frontend/archive/testng.xml @@ -5,6 +5,7 @@ + diff --git a/frontend/gradle.properties b/frontend/gradle.properties index 4166db7039..91ec64fe6e 100644 --- a/frontend/gradle.properties +++ b/frontend/gradle.properties @@ -11,4 +11,4 @@ torchserve_sdk_version=0.0.4 snakeyaml_version=1.31 grpc_version=1.50.0 protoc_version=3.18.0 -lmax_disruptor_version=3.4.4 +lmax_disruptor_version=3.4.4 \ No newline at end of file diff --git a/frontend/server/src/main/java/org/pytorch/serve/ModelServer.java b/frontend/server/src/main/java/org/pytorch/serve/ModelServer.java index 2ceee181d6..3d5a0435cc 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/ModelServer.java +++ b/frontend/server/src/main/java/org/pytorch/serve/ModelServer.java @@ -38,6 +38,7 @@ import org.pytorch.serve.archive.model.ModelNotFoundException; import org.pytorch.serve.grpcimpl.GRPCInterceptor; import org.pytorch.serve.grpcimpl.GRPCServiceFactory; +import org.pytorch.serve.metrics.MetricCache; import org.pytorch.serve.metrics.MetricManager; import org.pytorch.serve.servingsdk.ModelServerEndpoint; import org.pytorch.serve.servingsdk.annotations.Endpoint; @@ -48,29 +49,31 @@ import org.pytorch.serve.util.ConfigManager; import org.pytorch.serve.util.Connector; import org.pytorch.serve.util.ConnectorType; +import org.pytorch.serve.util.GPUManager; import org.pytorch.serve.util.ServerGroups; import org.pytorch.serve.wlm.Model; import org.pytorch.serve.wlm.ModelManager; import org.pytorch.serve.wlm.WorkLoadManager; +import org.pytorch.serve.wlm.WorkerInitializationException; import org.pytorch.serve.workflow.WorkflowManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class ModelServer { - private Logger logger = LoggerFactory.getLogger(ModelServer.class); - private ServerGroups serverGroups; private Server inferencegRPCServer; private Server managementgRPCServer; private List futures = new ArrayList<>(2); private AtomicBoolean stopped = new AtomicBoolean(false); private ConfigManager configManager; + private GPUManager gpuManager; public static final int MAX_RCVBUF_SIZE = 4096; /** Creates a new {@code ModelServer} instance. 
*/ - public ModelServer(ConfigManager configManager) { + public ModelServer(ConfigManager configManager, GPUManager gpuManager) { this.configManager = configManager; + this.gpuManager = gpuManager; serverGroups = new ServerGroups(configManager); } @@ -83,8 +86,11 @@ public static void main(String[] args) { ConfigManager.init(arguments); ConfigManager configManager = ConfigManager.getInstance(); PluginsManager.getInstance().initialize(); + MetricCache.init(); + GPUManager.init(configManager); + GPUManager gpuManager = GPUManager.getInstance(); InternalLoggerFactory.setDefaultFactory(Slf4JLoggerFactory.INSTANCE); - ModelServer modelServer = new ModelServer(configManager); + ModelServer modelServer = new ModelServer(configManager, gpuManager); Runtime.getRuntime() .addShutdownHook( @@ -119,7 +125,10 @@ public void startAndWait() startGRPCServers(); // Create and schedule metrics manager - MetricManager.scheduleMetrics(configManager); + if (!configManager.isSystemMetricsDisabled()) { + MetricManager.scheduleMetrics(configManager); + } + System.out.println("Model server started."); // NOPMD channelFutures.get(0).sync(); @@ -141,7 +150,7 @@ private String getDefaultModelName(String name) { } private void initModelStore() throws InvalidSnapshotException, IOException { - WorkLoadManager wlm = new WorkLoadManager(configManager, serverGroups.getBackendGroup()); + WorkLoadManager wlm = new WorkLoadManager(configManager, gpuManager, serverGroups.getBackendGroup()); ModelManager.init(configManager, wlm); WorkflowManager.init(configManager); SnapshotManager.init(configManager); @@ -192,26 +201,44 @@ private void initModelStore() throws InvalidSnapshotException, IOException { ModelArchive archive = modelManager.registerModel(file.getName(), defaultModelName); - modelManager.updateModel( - archive.getModelName(), - archive.getModelVersion(), + int minWorkers = configManager.getJsonIntValue( archive.getModelName(), archive.getModelVersion(), Model.MIN_WORKERS, - workers), + workers); + int maxWorkers = configManager.getJsonIntValue( archive.getModelName(), archive.getModelVersion(), Model.MAX_WORKERS, - workers), + workers); + if (archive.getModelConfig() != null) { + int marMinWorkers = archive.getModelConfig().getMinWorkers(); + int marMaxWorkers = archive.getModelConfig().getMaxWorkers(); + if (marMinWorkers > 0 && marMaxWorkers >= marMinWorkers) { + minWorkers = marMinWorkers; + maxWorkers = marMaxWorkers; + } else { + logger.warn( + "Invalid model config in mar, minWorkers:{}, maxWorkers:{}", + marMinWorkers, + marMaxWorkers); + } + } + modelManager.updateModel( + archive.getModelName(), + archive.getModelVersion(), + minWorkers, + maxWorkers, true, false); startupModels.add(archive.getModelName()); } catch (ModelException | IOException | InterruptedException - | DownloadArchiveException e) { + | DownloadArchiveException + | WorkerInitializationException e) { logger.warn("Failed to load model: " + file.getAbsolutePath(), e); } } @@ -251,26 +278,44 @@ private void initModelStore() throws InvalidSnapshotException, IOException { false, false, false); - modelManager.updateModel( - archive.getModelName(), - archive.getModelVersion(), + int minWorkers = configManager.getJsonIntValue( archive.getModelName(), archive.getModelVersion(), Model.MIN_WORKERS, - workers), + workers); + int maxWorkers = configManager.getJsonIntValue( archive.getModelName(), archive.getModelVersion(), Model.MAX_WORKERS, - workers), + workers); + if (archive.getModelConfig() != null) { + int marMinWorkers = 
archive.getModelConfig().getMinWorkers(); + int marMaxWorkers = archive.getModelConfig().getMaxWorkers(); + if (marMinWorkers > 0 && marMaxWorkers >= marMinWorkers) { + minWorkers = marMinWorkers; + maxWorkers = marMaxWorkers; + } else { + logger.warn( + "Invalid model config in mar, minWorkers:{}, maxWorkers:{}", + marMinWorkers, + marMaxWorkers); + } + } + modelManager.updateModel( + archive.getModelName(), + archive.getModelVersion(), + minWorkers, + maxWorkers, true, false); startupModels.add(archive.getModelName()); } catch (ModelException | IOException | InterruptedException - | DownloadArchiveException e) { + | DownloadArchiveException + | WorkerInitializationException e) { logger.warn("Failed to load model: " + url, e); } } diff --git a/frontend/server/src/main/java/org/pytorch/serve/grpcimpl/InferenceImpl.java b/frontend/server/src/main/java/org/pytorch/serve/grpcimpl/InferenceImpl.java index 7d0e5159de..1aef4a2559 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/grpcimpl/InferenceImpl.java +++ b/frontend/server/src/main/java/org/pytorch/serve/grpcimpl/InferenceImpl.java @@ -6,6 +6,8 @@ import io.grpc.stub.ServerCallStreamObserver; import io.grpc.stub.StreamObserver; import java.net.HttpURLConnection; +import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.UUID; import org.pytorch.serve.archive.model.ModelNotFoundException; @@ -19,12 +21,15 @@ import org.pytorch.serve.http.StatusResponse; import org.pytorch.serve.job.GRPCJob; import org.pytorch.serve.job.Job; -import org.pytorch.serve.metrics.api.MetricAggregator; +import org.pytorch.serve.metrics.IMetric; +import org.pytorch.serve.metrics.MetricCache; import org.pytorch.serve.util.ApiUtils; +import org.pytorch.serve.util.ConfigManager; import org.pytorch.serve.util.JsonUtils; import org.pytorch.serve.util.messages.InputParameter; import org.pytorch.serve.util.messages.RequestInput; import org.pytorch.serve.util.messages.WorkerCommands; +import org.pytorch.serve.wlm.Model; import org.pytorch.serve.wlm.ModelManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -45,13 +50,19 @@ public void ping(Empty request, StreamObserver respons }); Runnable r = () -> { - String response = ApiUtils.getWorkerStatus(); + boolean isHealthy = ApiUtils.isModelHealthy(); + int code = HttpURLConnection.HTTP_OK; + String response = "Healthy"; + if (!isHealthy) { + response = "Unhealthy"; + code = HttpURLConnection.HTTP_INTERNAL_ERROR; + } + TorchServeHealthResponse reply = TorchServeHealthResponse.newBuilder() .setHealth( JsonUtils.GSON_PRETTY_EXPOSED.toJson( - new StatusResponse( - response, HttpURLConnection.HTTP_OK))) + new StatusResponse(response, code))) .build(); responseObserver.onNext(reply); responseObserver.onCompleted(); @@ -62,6 +73,33 @@ public void ping(Empty request, StreamObserver respons @Override public void predictions( PredictionsRequest request, StreamObserver responseObserver) { + prediction(request, responseObserver, WorkerCommands.PREDICT); + } + + @Override + public void streamPredictions( + PredictionsRequest request, StreamObserver responseObserver) { + logger.info("streamPredictions get req"); + prediction(request, responseObserver, WorkerCommands.STREAMPREDICT); + } + + private void sendErrorResponse( + StreamObserver responseObserver, + Status status, + Exception e, + String description) { + responseObserver.onError( + status.withDescription(e.getMessage()) + .augmentDescription( + description == null ? 
e.getClass().getCanonicalName() : description) + .withCause(e) + .asRuntimeException()); + } + + private void prediction( + PredictionsRequest request, + StreamObserver responseObserver, + WorkerCommands workerCmd) { ((ServerCallStreamObserver) responseObserver) .setOnCancelHandler( () -> { @@ -86,25 +124,42 @@ public void predictions( String requestId = UUID.randomUUID().toString(); RequestInput inputData = new RequestInput(requestId); + try { + ModelManager modelManager = ModelManager.getInstance(); + Model model = modelManager.getModel(modelName, modelVersion); + if (model == null) { + throw new ModelNotFoundException("Model not found: " + modelName); + } + inputData.setClientExpireTS(model.getClientTimeoutInMills()); - for (Map.Entry entry : request.getInputMap().entrySet()) { - inputData.addParameter( - new InputParameter(entry.getKey(), entry.getValue().toByteArray())); - } + for (Map.Entry entry : request.getInputMap().entrySet()) { + inputData.addParameter( + new InputParameter(entry.getKey(), entry.getValue().toByteArray())); + } + + IMetric inferenceRequestsTotalMetric = + MetricCache.getInstance().getMetricFrontend("ts_inference_requests_total"); + if (inferenceRequestsTotalMetric != null) { + List inferenceRequestsTotalMetricDimensionValues = + Arrays.asList( + modelName, + modelVersion == null ? "default" : modelVersion, + ConfigManager.getInstance().getHostName()); + try { + inferenceRequestsTotalMetric.addOrUpdate( + inferenceRequestsTotalMetricDimensionValues, 1); + } catch (Exception e) { + logger.error( + "Failed to update frontend metric ts_inference_requests_total: ", e); + } + } - MetricAggregator.handleInferenceMetric(modelName, modelVersion); - Job job = - new GRPCJob( - responseObserver, - modelName, - modelVersion, - WorkerCommands.PREDICT, - inputData); + Job job = new GRPCJob(responseObserver, modelName, modelVersion, workerCmd, inputData); - try { - if (!ModelManager.getInstance().addJob(job)) { + if (!modelManager.addJob(job)) { + String priority = job.getPriority().toString(); String responseMessage = - ApiUtils.getInferenceErrorResponseMessage(modelName, modelVersion); + ApiUtils.getInferenceErrorResponseMessage(modelName, modelVersion, priority); InternalServerException e = new InternalServerException(responseMessage); sendErrorResponse( responseObserver, Status.INTERNAL, e, "InternalServerException.()"); @@ -113,17 +168,4 @@ public void predictions( sendErrorResponse(responseObserver, Status.INTERNAL, e, null); } } - - private void sendErrorResponse( - StreamObserver responseObserver, - Status status, - Exception e, - String description) { - responseObserver.onError( - status.withDescription(e.getMessage()) - .augmentDescription( - description == null ? 
e.getClass().getCanonicalName() : description) - .withCause(e) - .asRuntimeException()); - } } diff --git a/frontend/server/src/main/java/org/pytorch/serve/grpcimpl/ManagementImpl.java b/frontend/server/src/main/java/org/pytorch/serve/grpcimpl/ManagementImpl.java index a034f600e0..f254729b13 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/grpcimpl/ManagementImpl.java +++ b/frontend/server/src/main/java/org/pytorch/serve/grpcimpl/ManagementImpl.java @@ -27,6 +27,7 @@ import org.pytorch.serve.util.JsonUtils; import org.pytorch.serve.util.messages.RequestInput; import org.pytorch.serve.wlm.ModelManager; +import org.pytorch.serve.wlm.WorkerInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -117,7 +118,7 @@ public void registerModel( sendStatusResponse(responseObserver, statusResponse); } catch (InternalServerException e) { sendException(responseObserver, e, null); - } catch (ExecutionException | InterruptedException e) { + } catch (ExecutionException | InterruptedException | WorkerInitializationException e) { sendException(responseObserver, e, "Error while creating workers"); } catch (ModelNotFoundException | ModelVersionNotFoundException e) { sendErrorResponse(responseObserver, Status.NOT_FOUND, e); @@ -156,7 +157,7 @@ public void scaleWorker( false, null); sendStatusResponse(responseObserver, statusResponse); - } catch (ExecutionException | InterruptedException e) { + } catch (ExecutionException | InterruptedException | WorkerInitializationException e) { sendException(responseObserver, e, "Error while creating workers"); } catch (ModelNotFoundException | ModelVersionNotFoundException e) { sendErrorResponse(responseObserver, Status.NOT_FOUND, e); @@ -230,6 +231,14 @@ private void sendResponse(StreamObserver responseObserver, S responseObserver.onCompleted(); } + public static void sendErrorResponse( + StreamObserver responseObserver, Status status, Exception e) { + responseObserver.onError( + status.withDescription(e.getMessage()) + .augmentDescription(e.getClass().getCanonicalName()) + .asRuntimeException()); + } + private void sendErrorResponse( StreamObserver responseObserver, Status status, @@ -241,14 +250,6 @@ private void sendErrorResponse( .asRuntimeException()); } - public static void sendErrorResponse( - StreamObserver responseObserver, Status status, Exception e) { - responseObserver.onError( - status.withDescription(e.getMessage()) - .augmentDescription(e.getClass().getCanonicalName()) - .asRuntimeException()); - } - private void sendStatusResponse( StreamObserver responseObserver, StatusResponse statusResponse) { int httpResponseStatusCode = statusResponse.getHttpResponseCode(); diff --git a/frontend/server/src/main/java/org/pytorch/serve/http/HttpRequestHandlerChain.java b/frontend/server/src/main/java/org/pytorch/serve/http/HttpRequestHandlerChain.java index 8a381bcfca..7219c5b81b 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/http/HttpRequestHandlerChain.java +++ b/frontend/server/src/main/java/org/pytorch/serve/http/HttpRequestHandlerChain.java @@ -20,6 +20,7 @@ import org.pytorch.serve.servingsdk.impl.ModelServerResponse; import org.pytorch.serve.util.NettyUtils; import org.pytorch.serve.wlm.ModelManager; +import org.pytorch.serve.wlm.WorkerInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -45,7 +46,7 @@ public abstract void handleRequest( QueryStringDecoder decoder, String[] segments) throws ModelNotFoundException, ModelException, DownloadArchiveException, - WorkflowException; + 
WorkflowException, WorkerInitializationException; private void run( ModelServerEndpoint endpoint, diff --git a/frontend/server/src/main/java/org/pytorch/serve/http/Session.java b/frontend/server/src/main/java/org/pytorch/serve/http/Session.java index 11247331f9..1e0496e0ab 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/http/Session.java +++ b/frontend/server/src/main/java/org/pytorch/serve/http/Session.java @@ -1,6 +1,7 @@ package org.pytorch.serve.http; import io.netty.handler.codec.http.HttpRequest; +import io.netty.handler.codec.http.HttpHeaders; import java.util.UUID; public class Session { @@ -23,7 +24,14 @@ public Session(String remoteIp, HttpRequest request) { method = "GET"; protocol = "HTTP/1.1"; } - requestId = UUID.randomUUID().toString(); + + HttpHeaders headers = request.headers(); + if (headers.contains("x-request-id")) { + // adopt header value as prefix for internal request id + requestId = headers.getAsString("x-request-id") + ":" + UUID.randomUUID().toString(); + } else { + requestId = UUID.randomUUID().toString(); + } startTime = System.currentTimeMillis(); } diff --git a/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/ApiDescriptionRequestHandler.java b/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/ApiDescriptionRequestHandler.java index 9b99a826dd..05422dafa8 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/ApiDescriptionRequestHandler.java +++ b/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/ApiDescriptionRequestHandler.java @@ -12,6 +12,7 @@ import org.pytorch.serve.openapi.OpenApiUtils; import org.pytorch.serve.util.ConnectorType; import org.pytorch.serve.util.NettyUtils; +import org.pytorch.serve.wlm.WorkerInitializationException; public class ApiDescriptionRequestHandler extends HttpRequestHandlerChain { @@ -27,7 +28,8 @@ public void handleRequest( FullHttpRequest req, QueryStringDecoder decoder, String[] segments) - throws ModelException, DownloadArchiveException, WorkflowException { + throws ModelException, DownloadArchiveException, WorkflowException, + WorkerInitializationException { if (isApiDescription(segments)) { String path = decoder.path(); diff --git a/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/InferenceRequestHandler.java b/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/InferenceRequestHandler.java index 10774a1206..11936afd91 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/InferenceRequestHandler.java +++ b/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/InferenceRequestHandler.java @@ -10,6 +10,7 @@ import io.netty.handler.codec.http.multipart.HttpDataFactory; import io.netty.handler.codec.http.multipart.HttpPostRequestDecoder; import java.net.HttpURLConnection; +import java.util.Arrays; import java.util.List; import java.util.Map; import org.pytorch.serve.archive.DownloadArchiveException; @@ -21,7 +22,8 @@ import org.pytorch.serve.http.HttpRequestHandlerChain; import org.pytorch.serve.http.ResourceNotFoundException; import org.pytorch.serve.http.StatusResponse; -import org.pytorch.serve.metrics.api.MetricAggregator; +import org.pytorch.serve.metrics.IMetric; +import org.pytorch.serve.metrics.MetricCache; import org.pytorch.serve.openapi.OpenApiUtils; import org.pytorch.serve.servingsdk.ModelServerEndpoint; import org.pytorch.serve.util.ApiUtils; @@ -31,6 +33,7 @@ import org.pytorch.serve.util.messages.RequestInput; import org.pytorch.serve.wlm.Model; import org.pytorch.serve.wlm.ModelManager; 
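Tying this to the Session change above: a caller-supplied `x-request-id` header is now adopted as the prefix of TorchServe's internal request id, so clients can propagate their own trace ids end to end. A minimal client sketch (endpoint and model name are hypothetical):

```python
import uuid

import requests  # any HTTP client works; requests is used here for brevity

trace_id = str(uuid.uuid4())
response = requests.post(
    "http://localhost:8080/predictions/my_model",  # hypothetical model endpoint
    data=b"hello world",
    headers={"x-request-id": trace_id},  # becomes the prefix of the internal request id
)
print(response.status_code, trace_id)
```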
+import org.pytorch.serve.wlm.WorkerInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -54,7 +57,8 @@ public void handleRequest( FullHttpRequest req, QueryStringDecoder decoder, String[] segments) - throws ModelException, DownloadArchiveException, WorkflowException { + throws ModelException, DownloadArchiveException, WorkflowException, + WorkerInitializationException { if (isInferenceReq(segments)) { if (endpointMap.getOrDefault(segments[1], null) != null) { handleCustomEndpoint(ctx, req, segments, decoder); @@ -63,11 +67,15 @@ public void handleRequest( case "ping": Runnable r = () -> { - String response = ApiUtils.getWorkerStatus(); + boolean isHealthy = ApiUtils.isModelHealthy(); + int code = HttpURLConnection.HTTP_OK; + String response = "Healthy"; + if (!isHealthy) { + response = "Unhealthy"; + code = HttpURLConnection.HTTP_INTERNAL_ERROR; + } NettyUtils.sendJsonResponse( - ctx, - new StatusResponse( - response, HttpURLConnection.HTTP_OK)); + ctx, new StatusResponse(response, code)); }; ApiUtils.getTorchServeHealth(r); break; @@ -241,21 +249,35 @@ private void predict( throw new BadRequestException("Parameter model_name is required."); } } + ModelManager modelManager = ModelManager.getInstance(); + Model model = modelManager.getModel(modelName, modelVersion); + if (model == null) { + throw new ModelNotFoundException("Model not found: " + modelName); + } + input.setClientExpireTS(model.getClientTimeoutInMills()); if (HttpMethod.OPTIONS.equals(req.method())) { - ModelManager modelManager = ModelManager.getInstance(); - - Model model = modelManager.getModel(modelName, modelVersion); - if (model == null) { - throw new ModelNotFoundException("Model not found: " + modelName); - } - String resp = OpenApiUtils.getModelApi(model); NettyUtils.sendJsonResponse(ctx, resp); return; } - MetricAggregator.handleInferenceMetric(modelName, modelVersion); + IMetric inferenceRequestsTotalMetric = + MetricCache.getInstance().getMetricFrontend("ts_inference_requests_total"); + if (inferenceRequestsTotalMetric != null) { + List inferenceRequestsTotalMetricDimensionValues = + Arrays.asList( + modelName, + modelVersion == null ? 
"default" : modelVersion, + ConfigManager.getInstance().getHostName()); + try { + inferenceRequestsTotalMetric.addOrUpdate( + inferenceRequestsTotalMetricDimensionValues, 1); + } catch (Exception e) { + logger.error("Failed to update frontend metric ts_inference_requests_total: ", e); + } + } + ApiUtils.addRESTInferenceJob(ctx, modelName, modelVersion, input); } @@ -274,7 +296,7 @@ private static RequestInput parseRequest( CharSequence contentType = HttpUtil.getMimeType(req); for (Map.Entry entry : req.headers().entries()) { - inputData.updateHeaders(entry.getKey(), entry.getValue()); + inputData.updateHeaders(entry.getKey().toLowerCase(), entry.getValue()); } if (HttpPostRequestDecoder.isMultipart(req) diff --git a/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/ManagementRequestHandler.java b/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/ManagementRequestHandler.java index 913708428f..29a6f156cf 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/ManagementRequestHandler.java +++ b/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/ManagementRequestHandler.java @@ -38,6 +38,7 @@ import org.pytorch.serve.util.messages.WorkerCommands; import org.pytorch.serve.wlm.Model; import org.pytorch.serve.wlm.ModelManager; +import org.pytorch.serve.wlm.WorkerInitializationException; import org.pytorch.serve.wlm.WorkerThread; /** @@ -58,7 +59,8 @@ public void handleRequest( FullHttpRequest req, QueryStringDecoder decoder, String[] segments) - throws ModelException, DownloadArchiveException, WorkflowException { + throws ModelException, DownloadArchiveException, WorkflowException, + WorkerInitializationException { if (isManagementReq(segments)) { if (endpointMap.getOrDefault(segments[1], null) != null) { handleCustomEndpoint(ctx, req, segments, decoder); @@ -191,7 +193,7 @@ private KFV1ModelReadyResponse createKFV1ModelReadyResponse( private void handleRegisterModel( ChannelHandlerContext ctx, QueryStringDecoder decoder, FullHttpRequest req) - throws ModelException, DownloadArchiveException { + throws ModelException, DownloadArchiveException, WorkerInitializationException { RegisterModelRequest registerModelRequest = parseRequest(req, decoder); StatusResponse statusResponse; try { @@ -225,7 +227,8 @@ private void handleScaleModel( QueryStringDecoder decoder, String modelName, String modelVersion) - throws ModelNotFoundException, ModelVersionNotFoundException { + throws ModelNotFoundException, ModelVersionNotFoundException, + WorkerInitializationException { int minWorkers = NettyUtils.getIntParameter(decoder, "min_worker", 1); int maxWorkers = NettyUtils.getIntParameter(decoder, "max_worker", minWorkers); if (modelVersion == null) { diff --git a/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/PrometheusMetricsRequestHandler.java b/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/PrometheusMetricsRequestHandler.java index 41658e6909..9760babd46 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/PrometheusMetricsRequestHandler.java +++ b/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/PrometheusMetricsRequestHandler.java @@ -25,6 +25,7 @@ import org.pytorch.serve.archive.workflow.WorkflowException; import org.pytorch.serve.http.HttpRequestHandlerChain; import org.pytorch.serve.util.NettyUtils; +import org.pytorch.serve.wlm.WorkerInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,7 +45,8 @@ public void handleRequest( FullHttpRequest req, 
QueryStringDecoder decoder, String[] segments) - throws ModelException, DownloadArchiveException, WorkflowException { + throws ModelException, DownloadArchiveException, WorkflowException, + WorkerInitializationException { if (isManagementReq(segments)) { if (endpointMap.getOrDefault(segments[1], null) != null) { handleCustomEndpoint(ctx, req, segments, decoder); @@ -191,7 +193,7 @@ private KFV1ModelReadyResponse createKFV1ModelReadyResponse( private void handleRegisterModel( ChannelHandlerContext ctx, QueryStringDecoder decoder, FullHttpRequest req) - throws ModelException, DownloadArchiveException { + throws ModelException, DownloadArchiveException, WorkerInitializationException { RegisterModelRequest registerModelRequest = parseRequest(req, decoder); StatusResponse statusResponse; try { @@ -225,7 +227,8 @@ private void handleScaleModel( QueryStringDecoder decoder, String modelName, String modelVersion) - throws ModelNotFoundException, ModelVersionNotFoundException { + throws ModelNotFoundException, ModelVersionNotFoundException, + WorkerInitializationException { int minWorkers = NettyUtils.getIntParameter(decoder, "min_worker", 1); int maxWorkers = NettyUtils.getIntParameter(decoder, "max_worker", minWorkers); if (modelVersion == null) { diff --git a/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/PrometheusMetricsRequestHandler.java b/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/PrometheusMetricsRequestHandler.java index 41658e6909..9760babd46 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/PrometheusMetricsRequestHandler.java +++ b/frontend/server/src/main/java/org/pytorch/serve/http/api/rest/PrometheusMetricsRequestHandler.java @@ -25,6 +25,7 @@ import org.pytorch.serve.archive.workflow.WorkflowException; import org.pytorch.serve.http.HttpRequestHandlerChain; import org.pytorch.serve.util.NettyUtils; +import org.pytorch.serve.wlm.WorkerInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,7 +45,8 @@ public void handleRequest( FullHttpRequest req, QueryStringDecoder decoder, String[] segments) - throws ModelException, DownloadArchiveException, WorkflowException { + throws ModelException, DownloadArchiveException, WorkflowException, + WorkerInitializationException { if (segments.length >= 2 && "metrics".equals(segments[1])) { ByteBuf resBuf = Unpooled.directBuffer(); List<String> params = diff --git a/frontend/server/src/main/java/org/pytorch/serve/job/GRPCJob.java b/frontend/server/src/main/java/org/pytorch/serve/job/GRPCJob.java index 4735710226..364affd9fb 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/job/GRPCJob.java +++ b/frontend/server/src/main/java/org/pytorch/serve/job/GRPCJob.java @@ -1,9 +1,13 @@ package org.pytorch.serve.job; +import static org.pytorch.serve.util.messages.RequestInput.TS_STREAM_NEXT; + import com.google.protobuf.ByteString; import io.grpc.Status; import io.grpc.stub.StreamObserver; import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; import org.pytorch.serve.archive.model.ModelNotFoundException; @@ -12,8 +16,8 @@ import org.pytorch.serve.grpc.management.ManagementResponse; import org.pytorch.serve.grpcimpl.ManagementImpl; import org.pytorch.serve.http.messages.DescribeModelResponse; -import org.pytorch.serve.metrics.Dimension; -import org.pytorch.serve.metrics.Metric; +import org.pytorch.serve.metrics.IMetric; +import org.pytorch.serve.metrics.MetricCache; import org.pytorch.serve.util.ApiUtils; import org.pytorch.serve.util.ConfigManager; import org.pytorch.serve.util.GRPCUtils; @@ -25,10 +29,9 @@ public class GRPCJob extends Job { private static final Logger logger = LoggerFactory.getLogger(Job.class); - private static final Logger loggerTsMetrics = - LoggerFactory.getLogger(ConfigManager.MODEL_SERVER_METRICS_LOGGER); - private static final Dimension DIMENSION = new Dimension("Level", "Host"); + private final IMetric queueTimeMetric; + private final List<String> queueTimeMetricDimensionValues; private StreamObserver<PredictionResponse> predictionResponseObserver; private StreamObserver<ManagementResponse> managementResponseObserver; @@ -40,6 +43,9 @@ public GRPCJob( RequestInput input) { super(modelName, version, cmd, input); this.predictionResponseObserver = predictionResponseObserver; + this.queueTimeMetric = MetricCache.getInstance().getMetricFrontend("QueueTime"); + this.queueTimeMetricDimensionValues = + Arrays.asList("Host", ConfigManager.getInstance().getHostName()); } public GRPCJob( @@ -49,6 +55,9 @@ public GRPCJob( RequestInput input) { super(modelName, version, WorkerCommands.DESCRIBE, input); this.managementResponseObserver = managementResponseObserver; + this.queueTimeMetric = MetricCache.getInstance().getMetricFrontend("QueueTime"); + this.queueTimeMetricDimensionValues = + Arrays.asList("Host", ConfigManager.getInstance().getHostName()); } @Override @@ -60,28 +69,36 @@ public void response( Map<String, String> responseHeaders) { ByteString output = ByteString.copyFrom(body); - if (this.getCmd() == WorkerCommands.PREDICT) { + if (this.getCmd() == WorkerCommands.PREDICT + || this.getCmd() == WorkerCommands.STREAMPREDICT) { PredictionResponse reply = PredictionResponse.newBuilder().setPrediction(output).build(); predictionResponseObserver.onNext(reply); - predictionResponseObserver.onCompleted(); + if (this.getCmd() == WorkerCommands.PREDICT + || (this.getCmd() == WorkerCommands.STREAMPREDICT + && responseHeaders.get(TS_STREAM_NEXT).equals("false"))) { + predictionResponseObserver.onCompleted(); - logger.debug( - "Waiting time ns: {}, Backend time ns: {}", -
getScheduled() - getBegin(), - System.nanoTime() - getScheduled()); - String queueTime = - String.valueOf( - TimeUnit.MILLISECONDS.convert( - getScheduled() - getBegin(), TimeUnit.NANOSECONDS)); - loggerTsMetrics.info( - "{}", - new Metric( - "QueueTime", - queueTime, - "ms", - ConfigManager.getInstance().getHostName(), - DIMENSION)); + // TODO Simon: During rebase this part was moved inside the if block, + // verify that this is correct + logger.debug( + "Waiting time ns: {}, Backend time ns: {}", + getScheduled() - getBegin(), + System.nanoTime() - getScheduled()); + double queueTime = + (double) + TimeUnit.MILLISECONDS.convert( + getScheduled() - getBegin(), TimeUnit.NANOSECONDS); + if (this.queueTimeMetric != null) { + try { + this.queueTimeMetric.addOrUpdate( + this.queueTimeMetricDimensionValues, queueTime); + } catch (Exception e) { + logger.error("Failed to update frontend metric QueueTime: ", e); + } + } + // TODO Simon: we should probably also add the queue priority here, as we did before + } } else if (this.getCmd() == WorkerCommands.DESCRIBE) { try { ArrayList<DescribeModelResponse> respList = @@ -102,7 +119,8 @@ public void response( @Override public void sendError(int status, String error) { Status responseStatus = GRPCUtils.getGRPCStatusCode(status); - if (this.getCmd() == WorkerCommands.PREDICT) { + if (this.getCmd() == WorkerCommands.PREDICT + || this.getCmd() == WorkerCommands.STREAMPREDICT) { predictionResponseObserver.onError( responseStatus .withDescription(error) diff --git a/frontend/server/src/main/java/org/pytorch/serve/job/Job.java b/frontend/server/src/main/java/org/pytorch/serve/job/Job.java index d3c62c753e..6983a72af2 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/job/Job.java +++ b/frontend/server/src/main/java/org/pytorch/serve/job/Job.java @@ -1,15 +1,20 @@ package org.pytorch.serve.job; +import static org.pytorch.serve.util.messages.RequestInput.TS_STREAM_NEXT; + import java.util.Map; import org.pytorch.serve.util.messages.RequestInput; import org.pytorch.serve.util.messages.WorkerCommands; +import org.pytorch.serve.util.Prioritisable; +import org.pytorch.serve.util.Priority; -public abstract class Job { +public abstract class Job implements Prioritisable { private String modelName; private String modelVersion; private WorkerCommands cmd; // Else its data msg or inf requests private RequestInput input; + private Priority priority; private long begin; private long scheduled; @@ -20,6 +25,19 @@ public Job(String modelName, String version, WorkerCommands cmd, RequestInput in this.modelVersion = version; begin = System.nanoTime(); scheduled = begin; + if (cmd == WorkerCommands.STREAMPREDICT) { + input.updateHeaders(TS_STREAM_NEXT, "true"); + } + + this.priority = Priority.valueOf(input.getHeaders().getOrDefault("x-ts-priority", "MAX").toUpperCase()); + } + + public Priority getPriority() { + return this.priority; + } + + public void setPriority(Priority priority) { + this.priority = priority; } public String getJobId() { @@ -39,7 +57,9 @@ public WorkerCommands getCmd() { } public boolean isControlCmd() { - return !WorkerCommands.PREDICT.equals(cmd) && !WorkerCommands.DESCRIBE.equals(cmd); + return !WorkerCommands.PREDICT.equals(cmd) + && !WorkerCommands.STREAMPREDICT.equals(cmd) + && !WorkerCommands.DESCRIBE.equals(cmd); } public RequestInput getPayload() {
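// A minimal sketch (not part of this patch) of how the new "x-ts-priority" header is meant
// to flow: the Job constructor above maps it onto the Priority enum, defaulting to MAX, so a
// priority-aware queue can order jobs by it. The PriorityBlockingQueue below is hypothetical,
// the actual queue change lives elsewhere in this PR. Note that Priority.valueOf throws
// IllegalArgumentException for header values other than LOW/HIGH/MAX.
import java.util.Comparator;
import java.util.concurrent.PriorityBlockingQueue;
import org.pytorch.serve.util.Prioritisable;

public final class PriorityQueueSketch {
    // MAX jobs are taken before HIGH, and HIGH before LOW.
    public static PriorityBlockingQueue<Prioritisable> newJobQueue(int capacity) {
        return new PriorityBlockingQueue<>(
                capacity,
                Comparator.comparing(Prioritisable::getPriority, Comparator.reverseOrder()));
    }
}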
diff --git a/frontend/server/src/main/java/org/pytorch/serve/job/RestJob.java b/frontend/server/src/main/java/org/pytorch/serve/job/RestJob.java index d382c32999..064167b692 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/job/RestJob.java +++ b/frontend/server/src/main/java/org/pytorch/serve/job/RestJob.java @@ -1,15 +1,24 @@ package org.pytorch.serve.job; +import static org.pytorch.serve.util.messages.RequestInput.TS_STREAM_NEXT; + import io.netty.buffer.ByteBuf; +import io.netty.buffer.Unpooled; import io.netty.channel.ChannelHandlerContext; import io.netty.handler.codec.http.DefaultFullHttpResponse; +import io.netty.handler.codec.http.DefaultHttpContent; +import io.netty.handler.codec.http.DefaultHttpResponse; import io.netty.handler.codec.http.FullHttpResponse; import io.netty.handler.codec.http.HttpHeaderNames; import io.netty.handler.codec.http.HttpHeaderValues; +import io.netty.handler.codec.http.HttpResponse; import io.netty.handler.codec.http.HttpResponseStatus; import io.netty.handler.codec.http.HttpVersion; +import io.netty.handler.codec.http.LastHttpContent; import io.netty.util.CharsetUtil; import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; @@ -17,9 +26,8 @@ import org.pytorch.serve.archive.model.ModelVersionNotFoundException; import org.pytorch.serve.http.InternalServerException; import org.pytorch.serve.http.messages.DescribeModelResponse; -import org.pytorch.serve.metrics.Dimension; -import org.pytorch.serve.metrics.Metric; -import org.pytorch.serve.metrics.api.MetricAggregator; +import org.pytorch.serve.metrics.IMetric; +import org.pytorch.serve.metrics.MetricCache; import org.pytorch.serve.util.ApiUtils; import org.pytorch.serve.util.ConfigManager; import org.pytorch.serve.util.JsonUtils; @@ -32,12 +40,19 @@ public class RestJob extends Job { private static final Logger logger = LoggerFactory.getLogger(Job.class); - private static final Logger loggerTsMetrics = - LoggerFactory.getLogger(ConfigManager.MODEL_SERVER_METRICS_LOGGER); - private static final Dimension DIMENSION = new Dimension("Level", "Host"); + private final IMetric inferenceLatencyMetric; + private final IMetric queueLatencyMetric; + private final List<String> latencyMetricDimensionValues; + private final IMetric queueTimeMetric; + private final List<String> queueTimeMetricDimensionValues; private ChannelHandlerContext ctx; private CompletableFuture<byte[]> responsePromise; + /** + * numStreams tracks four cases: -1 marks the end of a stream; 0 a non-stream response (the + * default); 1 the first response in a stream; [2, Integer.MAX_VALUE] the second and later + * responses in a stream. + */ + private int numStreams; public RestJob( ChannelHandlerContext ctx, @@ -47,6 +62,19 @@ public RestJob( RequestInput input) { super(modelName, version, cmd, input); this.ctx = ctx; + this.inferenceLatencyMetric = + MetricCache.getInstance().getMetricFrontend("ts_inference_latency_microseconds"); + this.queueLatencyMetric = + MetricCache.getInstance().getMetricFrontend("ts_queue_latency_microseconds"); + this.latencyMetricDimensionValues = + Arrays.asList( + getModelName(), + getModelVersion() == null ? "default" : getModelVersion(), + ConfigManager.getInstance().getHostName()); + this.queueTimeMetric = MetricCache.getInstance().getMetricFrontend("QueueTime"); + this.queueTimeMetricDimensionValues = + Arrays.asList("Host", ConfigManager.getInstance().getHostName()); + this.numStreams = 0; } @Override @@ -117,7 +145,14 @@ private void responseInference( (statusPhrase == null) ?
HttpResponseStatus.valueOf(statusCode) : new HttpResponseStatus(statusCode, statusPhrase); - FullHttpResponse resp = new DefaultFullHttpResponse(HttpVersion.HTTP_1_1, status, false); + HttpResponse resp; + + if (responseHeaders != null && responseHeaders.containsKey(TS_STREAM_NEXT)) { + resp = new DefaultHttpResponse(HttpVersion.HTTP_1_1, status, false); + numStreams = responseHeaders.get(TS_STREAM_NEXT).equals("true") ? numStreams + 1 : -1; + } else { + resp = new DefaultFullHttpResponse(HttpVersion.HTTP_1_1, status, false); + } if (contentType != null && contentType.length() > 0) { resp.headers().set(HttpHeaderNames.CONTENT_TYPE, contentType); @@ -127,7 +162,6 @@ private void responseInference( resp.headers().set(e.getKey(), e.getValue()); } } - resp.content().writeBytes(body); /* * We can load the models based on the configuration file.Since this Job is * by external clients. */ if (ctx != null) { - MetricAggregator.handleInferenceMetric( - getModelName(), getModelVersion(), getScheduled() - getBegin(), inferTime); - NettyUtils.sendHttpResponse(ctx, resp, true); + if (numStreams == 0) { // non-stream response + ((DefaultFullHttpResponse) resp).content().writeBytes(body); + NettyUtils.sendHttpResponse(ctx, resp, true); + } else if (numStreams == -1) { // the last response in a stream + ctx.writeAndFlush(new DefaultHttpContent(Unpooled.wrappedBuffer(body))); + ctx.writeAndFlush(LastHttpContent.EMPTY_LAST_CONTENT); + } else if (numStreams == 1) { // the first response in a stream + NettyUtils.sendHttpResponse(ctx, resp, true); + ctx.writeAndFlush(new DefaultHttpContent(Unpooled.wrappedBuffer(body))); + } else if (numStreams > 1) { // the 2nd+ response in a stream + ctx.writeAndFlush(new DefaultHttpContent(Unpooled.wrappedBuffer(body))); + } } else if (responsePromise != null) { responsePromise.complete(body); } + + if (numStreams <= 0) { + if (this.inferenceLatencyMetric != null) { + try { + this.inferenceLatencyMetric.addOrUpdate( + this.latencyMetricDimensionValues, inferTime / 1000.0); + } catch (Exception e) { + logger.error( + "Failed to update frontend metric ts_inference_latency_microseconds: ", + e); + } + } + if (this.queueLatencyMetric != null) { + try { + this.queueLatencyMetric.addOrUpdate( + this.latencyMetricDimensionValues, + (getScheduled() - getBegin()) / 1000.0); + } catch (Exception e) { + logger.error( + "Failed to update frontend metric ts_queue_latency_microseconds: ", e); + } + } - logger.debug( - "Waiting time ns: {}, Backend time ns: {}", - getScheduled() - getBegin(), - System.nanoTime() - getScheduled()); - String queueTime = - String.valueOf( - TimeUnit.MILLISECONDS.convert( - getScheduled() - getBegin(), TimeUnit.NANOSECONDS)); - loggerTsMetrics.info( - "{}", - new Metric( - "QueueTime", - queueTime, - "ms", - ConfigManager.getInstance().getHostName(), - DIMENSION)); + logger.debug( + "Waiting time ns: {}, Backend time ns: {}", + getScheduled() - getBegin(), + System.nanoTime() - getScheduled()); + double queueTime = + (double) + TimeUnit.MILLISECONDS.convert( + getScheduled() - getBegin(), TimeUnit.NANOSECONDS); + if (this.queueTimeMetric != null) { + try { + this.queueTimeMetric.addOrUpdate( + this.queueTimeMetricDimensionValues, queueTime); + } catch (Exception e) { + logger.error("Failed to update frontend metric QueueTime: ", e); + } + } + // TODO Simon: we should probably also add the queue priority here, as we did before + } } @Override
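// A worked sketch (not part of this patch) of the numStreams state machine in
// responseInference above, assuming a backend that sets the ts_stream_next response header
// to "true" on intermediate chunks and "false" on the last one:
//   chunk 1: ts_stream_next=true  -> numStreams 0 -> 1: send DefaultHttpResponse headers
//            (Transfer-Encoding: chunked) plus the first body part
//   chunk 2: ts_stream_next=true  -> numStreams 1 -> 2: send a body part only
//   chunk 3: ts_stream_next=false -> numStreams -> -1: send the last body part followed by
//            LastHttpContent.EMPTY_LAST_CONTENT
// A non-stream response never carries the header, numStreams stays 0, and a single
// FullHttpResponse is written. The transition mirrors the ternary above:
static int nextNumStreams(int numStreams, String tsStreamNext) {
    // "true" means another chunk follows; anything else marks the end of the stream
    return "true".equals(tsStreamNext) ? numStreams + 1 : -1;
}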
diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/IMetric.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/IMetric.java new file mode 100644 index 0000000000..369fd71cc8 --- /dev/null +++ b/frontend/server/src/main/java/org/pytorch/serve/metrics/IMetric.java @@ -0,0 +1,23 @@ +package org.pytorch.serve.metrics; + +import java.util.ArrayList; +import java.util.List; + +public abstract class IMetric { + protected MetricBuilder.MetricType type; + protected String name; + protected String unit; + protected List<String> dimensionNames; + + public IMetric( + MetricBuilder.MetricType type, String name, String unit, List<String> dimensionNames) { + this.type = type; + this.name = name; + this.unit = unit; + this.dimensionNames = new ArrayList<>(dimensionNames); + } + + public abstract void addOrUpdate(List<String> dimensionValues, double value); + + public abstract void addOrUpdate(List<String> dimensionValues, String requestIds, double value); +} diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/LogMetric.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/LogMetric.java new file mode 100644 index 0000000000..55007b80a8 --- /dev/null +++ b/frontend/server/src/main/java/org/pytorch/serve/metrics/LogMetric.java @@ -0,0 +1,106 @@ +package org.pytorch.serve.metrics; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import org.pytorch.serve.util.ConfigManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class LogMetric extends IMetric { + /** + * Note: hostname, timestamp, and requestId (if available) are automatically added to the log metric. + */ + private static final Logger loggerTsMetrics = + LoggerFactory.getLogger(ConfigManager.MODEL_SERVER_METRICS_LOGGER); + + private static final Logger loggerModelMetrics = + LoggerFactory.getLogger(ConfigManager.MODEL_METRICS_LOGGER); + + public LogMetric( + MetricBuilder.MetricType type, String name, String unit, List<String> dimensionNames) { + super(type, name, unit, dimensionNames); + } + + @Override + public void addOrUpdate(List<String> dimensionValues, double value) { + // Used for logging frontend metrics + String metricString = this.buildMetricString(dimensionValues, value); + loggerTsMetrics.info(metricString); + } + + @Override + public void addOrUpdate(List<String> dimensionValues, String requestIds, double value) { + // Used for logging backend metrics + String metricString = this.buildMetricString(dimensionValues, requestIds, value); + loggerModelMetrics.info(metricString); + } + + private String buildMetricString(List<String> dimensionValues, double value) { + StringBuilder metricStringBuilder = new StringBuilder(); + metricStringBuilder .append(this.name) .append('.') .append(this.unit) .append(':') .append(value) .append("|#"); + + // Exclude the final dimension which is expected to be Hostname + int dimensionsCount = Math.min(this.dimensionNames.size() - 1, dimensionValues.size() - 1); + List<String> dimensions = new ArrayList<>(); + for (int index = 0; index < dimensionsCount; index++) { + dimensions.add(this.dimensionNames.get(index) + ":" + dimensionValues.get(index)); + } + metricStringBuilder.append(dimensions.stream().collect(Collectors.joining(","))); + + // The final dimension is expected to be Hostname + metricStringBuilder .append("|#hostname:") .append(dimensionValues.get(dimensionValues.size() - 1)); + + metricStringBuilder .append(",timestamp:") .append( String.valueOf(
TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()))); + + return metricStringBuilder.toString(); + } + + private String buildMetricString( + List<String> dimensionValues, String requestIds, double value) { + StringBuilder metricStringBuilder = new StringBuilder(); + metricStringBuilder .append(this.name) .append('.') .append(this.unit) .append(':') .append(value) .append("|#"); + + // Exclude the final dimension which is expected to be Hostname + int dimensionsCount = Math.min(this.dimensionNames.size() - 1, dimensionValues.size() - 1); + List<String> dimensions = new ArrayList<>(); + for (int index = 0; index < dimensionsCount; index++) { + dimensions.add(this.dimensionNames.get(index) + ":" + dimensionValues.get(index)); + } + metricStringBuilder.append(dimensions.stream().collect(Collectors.joining(","))); + + // The final dimension is expected to be Hostname + metricStringBuilder .append("|#hostname:") .append(dimensionValues.get(dimensionValues.size() - 1)); + + metricStringBuilder.append(",requestID:").append(requestIds); + + metricStringBuilder .append(",timestamp:") .append( String.valueOf( TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()))); + + return metricStringBuilder.toString(); + } +}
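// A worked example (not part of this patch) of the line buildMetricString produces. For a
// frontend metric declared with unit "Milliseconds" and dimensionNames [Level, Hostname],
// the call below logs roughly (names and values here are made up):
//   QueueTime.Milliseconds:42.0|#Level:Host|#hostname:my-host,timestamp:1690000000
// The last dimension value is always rendered as the hostname, and the timestamp is epoch
// seconds.
LogMetric queueTime =
        new LogMetric(
                MetricBuilder.MetricType.GAUGE,
                "QueueTime",
                "Milliseconds",
                Arrays.asList("Level", "Hostname"));
queueTime.addOrUpdate(Arrays.asList("Host", "my-host"), 42.0);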
diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricBuilder.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricBuilder.java new file mode 100644 index 0000000000..c8e3a25040 --- /dev/null +++ b/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricBuilder.java @@ -0,0 +1,43 @@ +package org.pytorch.serve.metrics; + +import java.util.List; +import org.pytorch.serve.metrics.format.prometheous.PrometheusCounter; +import org.pytorch.serve.metrics.format.prometheous.PrometheusGauge; +import org.pytorch.serve.metrics.format.prometheous.PrometheusHistogram; + +public final class MetricBuilder { + public enum MetricMode { + PROMETHEUS, + LOG + } + + public enum MetricType { + COUNTER, + GAUGE, + HISTOGRAM + } + + public static final IMetric build( + MetricMode mode, + MetricType type, + String name, + String unit, + List<String> dimensionNames) { + if (mode == MetricMode.PROMETHEUS) { + switch (type) { + case COUNTER: + return new PrometheusCounter(type, name, unit, dimensionNames); + case GAUGE: + return new PrometheusGauge(type, name, unit, dimensionNames); + case HISTOGRAM: + return new PrometheusHistogram(type, name, unit, dimensionNames); + default: + } + } else { + return new LogMetric(type, name, unit, dimensionNames); + } + return null; + } + + private MetricBuilder() {} +} diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCache.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCache.java new file mode 100644 index 0000000000..3ac81f3d54 --- /dev/null +++ b/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCache.java @@ -0,0 +1,116 @@ +package org.pytorch.serve.metrics; + +import java.io.FileNotFoundException; +import java.util.List; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import org.pytorch.serve.metrics.configuration.MetricConfiguration; +import org.pytorch.serve.metrics.configuration.MetricSpecification; +import org.pytorch.serve.util.ConfigManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public final class MetricCache { + private static final Logger logger = LoggerFactory.getLogger(MetricCache.class); + private static MetricCache instance; + private MetricConfiguration config; + private ConcurrentMap<String, IMetric> metricsFrontend; + private ConcurrentMap<String, IMetric> metricsBackend; + + private MetricCache() throws FileNotFoundException { + this.metricsFrontend = new ConcurrentHashMap<>(); + this.metricsBackend = new ConcurrentHashMap<>(); + + String metricsConfigPath = ConfigManager.getInstance().getMetricsConfigPath(); + try { + this.config = MetricConfiguration.loadConfiguration(metricsConfigPath); + } catch (FileNotFoundException | RuntimeException e) { + logger.error("Failed to load metrics configuration: ", e); + return; + } + + MetricBuilder.MetricMode metricsMode = MetricBuilder.MetricMode.LOG; + String metricsConfigMode = ConfigManager.getInstance().getMetricsMode(); + if (metricsConfigMode != null && metricsConfigMode.toLowerCase().contains("prometheus")) { + metricsMode = MetricBuilder.MetricMode.PROMETHEUS; + } + + if (this.config.getTs_metrics() != null) { + addMetrics( + this.metricsFrontend, + this.config.getTs_metrics().getCounter(), + metricsMode, + MetricBuilder.MetricType.COUNTER); + addMetrics( + this.metricsFrontend, + this.config.getTs_metrics().getGauge(), + metricsMode, + MetricBuilder.MetricType.GAUGE); + addMetrics( + this.metricsFrontend, + this.config.getTs_metrics().getHistogram(), + metricsMode, + MetricBuilder.MetricType.HISTOGRAM); + } + + if (this.config.getModel_metrics() != null) { + addMetrics( + this.metricsBackend, + this.config.getModel_metrics().getCounter(), + metricsMode, + MetricBuilder.MetricType.COUNTER); + addMetrics( + this.metricsBackend, + this.config.getModel_metrics().getGauge(), + metricsMode, + MetricBuilder.MetricType.GAUGE); + addMetrics( + this.metricsBackend, + this.config.getModel_metrics().getHistogram(), + metricsMode, + MetricBuilder.MetricType.HISTOGRAM); + } + } + + private void addMetrics( + ConcurrentMap<String, IMetric> metricCache, + List<MetricSpecification> metricsSpec, + MetricBuilder.MetricMode metricMode, + MetricBuilder.MetricType metricType) { + if (metricsSpec == null) { + return; + } + + for (MetricSpecification spec : metricsSpec) { + metricCache.put( + spec.getName(), + MetricBuilder.build( + metricMode, + metricType, + spec.getName(), + spec.getUnit(), + spec.getDimensions())); + } + } + + public static void init() throws FileNotFoundException { + if (instance != null) { + logger.error("Skip initializing metrics cache since it has already been initialized"); + return; + } + + instance = new MetricCache(); + } + + public static MetricCache getInstance() { + return instance; + } + + public IMetric getMetricFrontend(String metricName) { + return metricsFrontend.get(metricName); + } + + public IMetric getMetricBackend(String metricName) { + return metricsBackend.get(metricName); + } +}
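// A minimal usage sketch (not part of this patch) of the lookup-then-update pattern the
// rest of this PR follows. Metric names must match entries in the metrics config file; an
// unknown name returns null and the update is simply skipped. MetricCache.init() declares
// FileNotFoundException, hence the throws clause.
static void emitQueueTime(double millis) throws FileNotFoundException {
    MetricCache.init(); // normally called once at server startup
    IMetric queueTime = MetricCache.getInstance().getMetricFrontend("QueueTime");
    if (queueTime != null) {
        try {
            queueTime.addOrUpdate(
                    Arrays.asList("Host", ConfigManager.getInstance().getHostName()), millis);
        } catch (Exception e) {
            // in PROMETHEUS mode addOrUpdate can throw, e.g. on a label-count mismatch
        }
    }
}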
diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCollector.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCollector.java index f6a8cf1522..e83c809260 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCollector.java +++ b/frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCollector.java @@ -12,25 +12,29 @@ import org.apache.commons.io.IOUtils; import org.pytorch.serve.util.ConfigManager; import org.pytorch.serve.util.messages.EnvironmentUtils; +import org.pytorch.serve.wlm.Model; import org.pytorch.serve.wlm.ModelManager; +import org.pytorch.serve.wlm.ModelVersionedRefs; import org.pytorch.serve.wlm.WorkerThread; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class MetricCollector implements Runnable { private static final Logger logger = LoggerFactory.getLogger(MetricCollector.class); - private static final Logger loggerMetrics = - LoggerFactory.getLogger(ConfigManager.MODEL_SERVER_METRICS_LOGGER); + private final MetricCache metricCache; private ConfigManager configManager; public MetricCollector(ConfigManager configManager) { this.configManager = configManager; + this.metricCache = MetricCache.getInstance(); } @Override public void run() { try { + // TODO Simon: metrics-related code was removed here during the rebase; we may need to add it back + // Collect System level Metrics String[] args = new String[4]; args[0] = configManager.getPythonExecutable(); @@ -40,7 +44,6 @@ public void run() { File workingDir = new File(configManager.getModelServerHome()); String[] envp = EnvironmentUtils.getEnvString(workingDir.getAbsolutePath(), null, null); - final Process p = Runtime.getRuntime().exec(args, envp, workingDir); // NOPMD ModelManager modelManager = ModelManager.getInstance(); Map<Integer, WorkerThread> workerMap = modelManager.getWorkers(); @@ -79,7 +82,27 @@ public void run() { if (metric == null) { logger.warn("Parse metrics failed: " + line); } else { - loggerMetrics.info("{}", metric); + if (this.metricCache.getMetricFrontend(metric.getMetricName()) != null) { + try { + List<String> dimensionValues = new ArrayList<>(); + for (Dimension dimension : metric.getDimensions()) { + dimensionValues.add(dimension.getValue()); + } + // Frontend metrics by default have the last dimension as Hostname + dimensionValues.add(metric.getHostName()); + this.metricCache + .getMetricFrontend(metric.getMetricName()) + .addOrUpdate( + dimensionValues, + Double.parseDouble(metric.getValue())); + } catch (Exception e) { + logger.error( + "Failed to update frontend metric {}: ", + metric.getMetricName(), + e); + } + } metricsSystem.add(metric); } } diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/api/MetricAggregator.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/api/MetricAggregator.java deleted file mode 100644 index 383942508f..0000000000 --- a/frontend/server/src/main/java/org/pytorch/serve/metrics/api/MetricAggregator.java +++ /dev/null @@ -1,28 +0,0 @@ -package org.pytorch.serve.metrics.api; - -import org.pytorch.serve.metrics.format.prometheous.PrometheusMetricManager; -import org.pytorch.serve.util.ConfigManager; - -public final class MetricAggregator { - - private MetricAggregator() {} - - public static void handleInferenceMetric(final String modelName, final String modelVersion) { - ConfigManager configMgr = ConfigManager.getInstance(); - if (configMgr.isMetricApiEnable() - && configMgr.getMetricsFormat().equals(ConfigManager.METRIC_FORMAT_PROMETHEUS)) { - PrometheusMetricManager.getInstance().incInferCount(modelName, modelVersion); - } - } - - public static void handleInferenceMetric( - final String modelName, final String modelVersion, long timeInQueue, long inferTime) { - ConfigManager configMgr = ConfigManager.getInstance(); - if (configMgr.isMetricApiEnable() - && configMgr.getMetricsFormat().equals(ConfigManager.METRIC_FORMAT_PROMETHEUS)) { - PrometheusMetricManager metrics = PrometheusMetricManager.getInstance(); - metrics.incInferLatency(inferTime, modelName, modelVersion); - metrics.incQueueLatency(timeInQueue, modelName, modelVersion); - } - } -} diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/configuration/MetricConfiguration.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/configuration/MetricConfiguration.java new file mode 100644 index 0000000000..cb41a0d907 --- /dev/null +++
b/frontend/server/src/main/java/org/pytorch/serve/metrics/configuration/MetricConfiguration.java @@ -0,0 +1,91 @@ +package org.pytorch.serve.metrics.configuration; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.util.List; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.yaml.snakeyaml.Yaml; +import org.yaml.snakeyaml.composer.ComposerException; +import org.yaml.snakeyaml.constructor.Constructor; + +public class MetricConfiguration { + private static final Logger logger = LoggerFactory.getLogger(MetricConfiguration.class); + private List<String> dimensions; + + @SuppressWarnings("checkstyle:MemberName") + private MetricTypes ts_metrics; + + @SuppressWarnings("checkstyle:MemberName") + private MetricTypes model_metrics; + + public void setDimensions(List<String> dimensions) { + this.dimensions = dimensions; + } + + public List<String> getDimensions() { + return this.dimensions; + } + + @SuppressWarnings("checkstyle:MethodName") + public void setTs_metrics(MetricTypes tsMetrics) { + this.ts_metrics = tsMetrics; + } + + @SuppressWarnings("checkstyle:MethodName") + public MetricTypes getTs_metrics() { + return this.ts_metrics; + } + + @SuppressWarnings("checkstyle:MethodName") + public void setModel_metrics(MetricTypes modelMetrics) { + // The Hostname dimension is included by default for model metrics + modelMetrics.setCounter(this.addHostnameDimensionToMetrics(modelMetrics.getCounter())); + modelMetrics.setGauge(this.addHostnameDimensionToMetrics(modelMetrics.getGauge())); + modelMetrics.setHistogram(this.addHostnameDimensionToMetrics(modelMetrics.getHistogram())); + this.model_metrics = modelMetrics; + } + + @SuppressWarnings("checkstyle:MethodName") + public MetricTypes getModel_metrics() { + return this.model_metrics; + } + + public void validate() { + if (this.ts_metrics != null) { + ts_metrics.validate(); + } + + if (this.model_metrics != null) { + model_metrics.validate(); + } + } + + public static MetricConfiguration loadConfiguration(String configFilePath) + throws FileNotFoundException, ComposerException, RuntimeException { + Constructor constructor = new Constructor(MetricConfiguration.class); + Yaml yaml = new Yaml(constructor); + FileInputStream inputStream = new FileInputStream(new File(configFilePath)); + MetricConfiguration config = yaml.load(inputStream); + config.validate(); + logger.info("Successfully loaded metrics configuration from {}", configFilePath); + + return config; + } + + private List<MetricSpecification> addHostnameDimensionToMetrics( + List<MetricSpecification> metricsSpec) { + if (metricsSpec == null) { + return metricsSpec; + } + + for (MetricSpecification spec : metricsSpec) { + List<String> dimensions = spec.getDimensions(); + dimensions.add("Hostname"); + spec.setDimensions(dimensions); + } + + return metricsSpec; + } +} diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/configuration/MetricSpecification.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/configuration/MetricSpecification.java new file mode 100644 index 0000000000..28a7bf3067 --- /dev/null +++ b/frontend/server/src/main/java/org/pytorch/serve/metrics/configuration/MetricSpecification.java @@ -0,0 +1,48 @@ +package org.pytorch.serve.metrics.configuration; + +import java.util.List; + +public class MetricSpecification { + private String name; + private String unit; + private List<String> dimensions; + + public void setName(String name) { + this.name = name; + } + + public String getName() { + return this.name; + } + + public void setUnit(String unit) {
this.unit = unit; + } + + public String getUnit() { + return this.unit; + } + + public void setDimensions(List<String> dimensions) { + this.dimensions = dimensions; + } + + public List<String> getDimensions() { + return this.dimensions; + } + + @Override + public String toString() { + return "name: " + this.name + ", unit: " + this.unit + ", dimensions: " + this.dimensions; + } + + public void validate() { + if (this.name == null || this.name.isEmpty()) { + throw new RuntimeException("Metric name cannot be empty. " + this); + } + + if (this.unit == null || this.unit.isEmpty()) { + throw new RuntimeException("Metric unit cannot be empty. " + this); + } + } +} diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/configuration/MetricTypes.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/configuration/MetricTypes.java new file mode 100644 index 0000000000..82f1bc7533 --- /dev/null +++ b/frontend/server/src/main/java/org/pytorch/serve/metrics/configuration/MetricTypes.java @@ -0,0 +1,53 @@ +package org.pytorch.serve.metrics.configuration; + +import java.util.List; + +public class MetricTypes { + private List<MetricSpecification> counter; + private List<MetricSpecification> gauge; + private List<MetricSpecification> histogram; + + public void setCounter(List<MetricSpecification> counter) { + this.counter = counter; + } + + public List<MetricSpecification> getCounter() { + return this.counter; + } + + public void setGauge(List<MetricSpecification> gauge) { + this.gauge = gauge; + } + + public List<MetricSpecification> getGauge() { + return this.gauge; + } + + public void setHistogram(List<MetricSpecification> histogram) { + this.histogram = histogram; + } + + public List<MetricSpecification> getHistogram() { + return this.histogram; + } + + public void validate() { + if (this.counter != null) { + for (MetricSpecification spec : this.counter) { + spec.validate(); + } + } + + if (this.gauge != null) { + for (MetricSpecification spec : this.gauge) { + spec.validate(); + } + } + + if (this.histogram != null) { + for (MetricSpecification spec : this.histogram) { + spec.validate(); + } + } + } +} diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/format/prometheous/PrometheusCounter.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/format/prometheous/PrometheusCounter.java new file mode 100644 index 0000000000..fa23fb9dea --- /dev/null +++ b/frontend/server/src/main/java/org/pytorch/serve/metrics/format/prometheous/PrometheusCounter.java @@ -0,0 +1,32 @@ +package org.pytorch.serve.metrics.format.prometheous; + +import io.prometheus.client.Counter; +import java.util.List; +import org.pytorch.serve.metrics.IMetric; +import org.pytorch.serve.metrics.MetricBuilder; + +public class PrometheusCounter extends IMetric { + private final Counter counter; + + public PrometheusCounter( + MetricBuilder.MetricType type, String name, String unit, List<String> dimensionNames) { + super(type, name, unit, dimensionNames); + this.counter = + Counter.build() + .name(this.name) + .labelNames( + this.dimensionNames.toArray(new String[this.dimensionNames.size()])) + .help("Torchserve prometheus counter metric with unit: " + this.unit) + .register(); + } + + @Override + public void addOrUpdate(List<String> dimensionValues, double value) { + this.counter.labels(dimensionValues.toArray(new String[dimensionValues.size()])).inc(value); + } + + @Override + public void addOrUpdate(List<String> dimensionValues, String requestIds, double value) { + this.addOrUpdate(dimensionValues, value); + } +}
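// For orientation, a hypothetical metrics_config YAML of the shape that MetricConfiguration,
// MetricTypes, and MetricSpecification above deserialize (SnakeYAML maps the snake_case keys
// onto setTs_metrics/setModel_metrics; the stock file shipped with TorchServe may differ in
// names and entries):
//
//   dimensions: [Level, Hostname, ModelName]
//   ts_metrics:
//     counter:
//       - name: Requests2XX
//         unit: Count
//         dimensions: [Level, Hostname]
//     gauge:
//       - name: QueueTime
//         unit: Milliseconds
//         dimensions: [Level, Hostname]
//   model_metrics:  # a Hostname dimension is appended to each spec automatically
//     gauge:
//       - name: HandlerTime
//         unit: ms
//         dimensions: [ModelName, Level]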
diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/format/prometheous/PrometheusGauge.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/format/prometheous/PrometheusGauge.java new file mode 100644 index 0000000000..17392e3fc1 --- /dev/null +++ b/frontend/server/src/main/java/org/pytorch/serve/metrics/format/prometheous/PrometheusGauge.java @@ -0,0 +1,32 @@ +package org.pytorch.serve.metrics.format.prometheous; + +import io.prometheus.client.Gauge; +import java.util.List; +import org.pytorch.serve.metrics.IMetric; +import org.pytorch.serve.metrics.MetricBuilder; + +public class PrometheusGauge extends IMetric { + private final Gauge gauge; + + public PrometheusGauge( + MetricBuilder.MetricType type, String name, String unit, List<String> dimensionNames) { + super(type, name, unit, dimensionNames); + this.gauge = + Gauge.build() + .name(this.name) + .labelNames( + this.dimensionNames.toArray(new String[this.dimensionNames.size()])) + .help("Torchserve prometheus gauge metric with unit: " + this.unit) + .register(); + } + + @Override + public void addOrUpdate(List<String> dimensionValues, double value) { + this.gauge.labels(dimensionValues.toArray(new String[dimensionValues.size()])).set(value); + } + + @Override + public void addOrUpdate(List<String> dimensionValues, String requestIds, double value) { + this.addOrUpdate(dimensionValues, value); + } +} diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/format/prometheous/PrometheusHistogram.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/format/prometheous/PrometheusHistogram.java new file mode 100644 index 0000000000..f66c2aeeb1 --- /dev/null +++ b/frontend/server/src/main/java/org/pytorch/serve/metrics/format/prometheous/PrometheusHistogram.java @@ -0,0 +1,34 @@ +package org.pytorch.serve.metrics.format.prometheous; + +import io.prometheus.client.Histogram; +import java.util.List; +import org.pytorch.serve.metrics.IMetric; +import org.pytorch.serve.metrics.MetricBuilder; + +public class PrometheusHistogram extends IMetric { + private final Histogram histogram; + + public PrometheusHistogram( + MetricBuilder.MetricType type, String name, String unit, List<String> dimensionNames) { + super(type, name, unit, dimensionNames); + this.histogram = + Histogram.build() + .name(this.name) + .labelNames( + this.dimensionNames.toArray(new String[this.dimensionNames.size()])) + .help("Torchserve prometheus histogram metric with unit: " + this.unit) + .register(); + } + + @Override + public void addOrUpdate(List<String> dimensionValues, double value) { + this.histogram + .labels(dimensionValues.toArray(new String[dimensionValues.size()])) + .observe(value); + } + + @Override + public void addOrUpdate(List<String> dimensionValues, String requestIds, double value) { + this.addOrUpdate(dimensionValues, value); + } +} diff --git a/frontend/server/src/main/java/org/pytorch/serve/metrics/format/prometheous/PrometheusMetricManager.java b/frontend/server/src/main/java/org/pytorch/serve/metrics/format/prometheous/PrometheusMetricManager.java deleted file mode 100644 index 27bc015420..0000000000 --- a/frontend/server/src/main/java/org/pytorch/serve/metrics/format/prometheous/PrometheusMetricManager.java +++ /dev/null @@ -1,81 +0,0 @@ -package org.pytorch.serve.metrics.format.prometheous; - -import io.prometheus.client.Counter; -import java.util.UUID; - -public final class PrometheusMetricManager { - - private static final PrometheusMetricManager METRIC_MANAGER = new PrometheusMetricManager(); - private static final String METRICS_UUID = UUID.randomUUID().toString(); - private Counter inferRequestCount; - private Counter inferLatency; - private Counter
queueLatency; - - private PrometheusMetricManager() { - String[] metricsLabels = {"uuid", "model_name", "model_version"}; - inferRequestCount = - Counter.build() - .name("ts_inference_requests_total") - .labelNames(metricsLabels) - .help("Total number of inference requests.") - .register(); - inferLatency = - Counter.build() - .name("ts_inference_latency_microseconds") - .labelNames(metricsLabels) - .help("Cumulative inference duration in microseconds") - .register(); - queueLatency = - Counter.build() - .name("ts_queue_latency_microseconds") - .labelNames(metricsLabels) - .help("Cumulative queue duration in microseconds") - .register(); - } - - private static String getOrDefaultModelVersion(String modelVersion) { - return modelVersion == null ? "default" : modelVersion; - } - - public static PrometheusMetricManager getInstance() { - return METRIC_MANAGER; - } - - /** - * Counts the time in ns it took for an inference to be completed - * - * @param inferTime time in nanoseconds - * @param modelName name of the model - * @param modelVersion version of the model - */ - public void incInferLatency(long inferTime, String modelName, String modelVersion) { - inferLatency - .labels(METRICS_UUID, modelName, getOrDefaultModelVersion(modelVersion)) - .inc(inferTime / 1000.0); - } - - /** - * Counts the time in ns an inference request was queued before being executed - * - * @param queueTime time in nanoseconds - * @param modelName name of the model - * @param modelVersion version of the model - */ - public void incQueueLatency(long queueTime, String modelName, String modelVersion) { - queueLatency - .labels(METRICS_UUID, modelName, getOrDefaultModelVersion(modelVersion)) - .inc(queueTime / 1000.0); - } - - /** - * Counts a valid inference request to be processed - * - * @param modelName name of the model - * @param modelVersion version of the model - */ - public void incInferCount(String modelName, String modelVersion) { - inferRequestCount - .labels(METRICS_UUID, modelName, getOrDefaultModelVersion(modelVersion)) - .inc(); - } -} diff --git a/frontend/server/src/main/java/org/pytorch/serve/snapshot/SnapshotManager.java b/frontend/server/src/main/java/org/pytorch/serve/snapshot/SnapshotManager.java index c6d910d59d..6d0aca353f 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/snapshot/SnapshotManager.java +++ b/frontend/server/src/main/java/org/pytorch/serve/snapshot/SnapshotManager.java @@ -17,6 +17,7 @@ import org.pytorch.serve.util.ConfigManager; import org.pytorch.serve.wlm.Model; import org.pytorch.serve.wlm.ModelManager; +import org.pytorch.serve.wlm.WorkerInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -139,7 +140,10 @@ private void initModels(Snapshot snapshot) { } catch (IOException e) { logger.error("Error while retrieving snapshot details. Details: {}", e.getMessage()); - } catch (ModelException | InterruptedException | DownloadArchiveException e) { + } catch (ModelException + | InterruptedException + | DownloadArchiveException + | WorkerInitializationException e) { logger.error("Error while registering model. 
Details: {}", e.getMessage()); } } diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java b/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java index c7adbca786..89dc316613 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java +++ b/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java @@ -35,6 +35,7 @@ import org.pytorch.serve.wlm.Model; import org.pytorch.serve.wlm.ModelManager; import org.pytorch.serve.wlm.ModelVersionedRefs; +import org.pytorch.serve.wlm.WorkerInitializationException; import org.pytorch.serve.wlm.WorkerState; import org.pytorch.serve.wlm.WorkerThread; @@ -108,7 +109,7 @@ public static String setDefault(String modelName, String newModelVersion) public static StatusResponse registerModel(RegisterModelRequest registerModelRequest) throws ModelException, InternalServerException, ExecutionException, - InterruptedException, DownloadArchiveException { + InterruptedException, DownloadArchiveException, WorkerInitializationException { String modelUrl = registerModelRequest.getModelUrl(); if (modelUrl == null) { throw new BadRequestException("Parameter url is required."); @@ -162,7 +163,7 @@ public static StatusResponse handleRegister( boolean isWorkflowModel, boolean s3SseKms) throws ModelException, ExecutionException, InterruptedException, - DownloadArchiveException { + DownloadArchiveException, WorkerInitializationException { ModelManager modelManager = ModelManager.getInstance(); final ModelArchive archive; @@ -188,7 +189,17 @@ public static StatusResponse handleRegister( } modelName = archive.getModelName(); - if (initialWorkers <= 0) { + int minWorkers = 0; + int maxWorkers = 0; + if (archive.getModelConfig() != null) { + int marMinWorkers = archive.getModelConfig().getMinWorkers(); + int marMaxWorkers = archive.getModelConfig().getMaxWorkers(); + if (marMinWorkers > 0 && marMaxWorkers >= marMinWorkers) { + minWorkers = marMinWorkers; + maxWorkers = marMaxWorkers; + } + } + if (initialWorkers <= 0 && minWorkers == 0) { final String msg = "Model \"" + modelName @@ -200,12 +211,14 @@ public static StatusResponse handleRegister( } return new StatusResponse(msg, HttpURLConnection.HTTP_OK); } + minWorkers = minWorkers > 0 ? minWorkers : initialWorkers; + maxWorkers = maxWorkers > 0 ? 
maxWorkers : initialWorkers; return ApiUtils.updateModelWorkers( modelName, archive.getModelVersion(), - initialWorkers, - initialWorkers, + minWorkers, + maxWorkers, isSync, true, f -> { @@ -223,7 +236,7 @@ public static StatusResponse updateModelWorkers( boolean isInit, final Function onError) throws ModelVersionNotFoundException, ModelNotFoundException, ExecutionException, - InterruptedException { + InterruptedException, WorkerInitializationException { ModelManager modelManager = ModelManager.getInstance(); if (maxWorkers < minWorkers) { @@ -359,6 +372,23 @@ public static String getWorkerStatus() { return response; } + public static boolean isModelHealthy() { + ModelManager modelManager = ModelManager.getInstance(); + int numHealthy = 0; + int numScaled = 0; + + for (Map.Entry m : modelManager.getAllModels()) { + numScaled = m.getValue().getDefaultModel().getMinWorkers(); + numHealthy = + modelManager.getNumHealthyWorkers( + m.getValue().getDefaultModel().getModelVersionName()); + if (numHealthy < numScaled) { + return false; + } + } + return true; + } + private static DescribeModelResponse createModelResponse( ModelManager modelManager, String modelName, Model model) { DescribeModelResponse resp = new DescribeModelResponse(); @@ -394,22 +424,25 @@ public static RestJob addRESTInferenceJob( throws ModelNotFoundException, ModelVersionNotFoundException { RestJob job = new RestJob(ctx, modelName, version, WorkerCommands.PREDICT, input); if (!ModelManager.getInstance().addJob(job)) { - String responseMessage = getInferenceErrorResponseMessage(modelName, version); + String priority = job.getPriority().toString(); + String responseMessage = getInferenceErrorResponseMessage(modelName, version, priority); throw new ServiceUnavailableException(responseMessage); } return job; } @SuppressWarnings("PMD") - public static String getInferenceErrorResponseMessage(String modelName, String modelVersion) { - String responseMessage = "Model \"" + modelName; + public static String getInferenceErrorResponseMessage(String modelName, String modelVersion, String jobPriority) { + String responseMessage = "Model: " + modelName + "\n"; if (modelVersion != null) { - responseMessage += "\" Version " + modelVersion; + responseMessage += "Version: " + modelVersion + "\n"; } + responseMessage += "Priority: " + jobPriority + "\n"; + responseMessage += - "\" has no worker to serve inference request. 
Please use scale workers API to add workers."; + "Reason: queue full"; return responseMessage; } diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java b/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java index 70a21df416..6a7499959d 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java +++ b/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java @@ -64,8 +64,13 @@ public final class ConfigManager { private static final String TS_NUMBER_OF_NETTY_THREADS = "number_of_netty_threads"; private static final String TS_NETTY_CLIENT_THREADS = "netty_client_threads"; private static final String TS_JOB_QUEUE_SIZE = "job_queue_size"; + private static final String TS_HIGH_PRIORITY_PROBABILITY = "high_prio_prob"; private static final String TS_NUMBER_OF_GPU = "number_of_gpu"; + private static final String TS_MIN_FREE_GPU_MEMORY = "min_free_gpu_memory"; + private static final String TS_MAX_SHARE_GPU_FAILURES = "max_share_gpu_failures"; private static final String TS_METRICS_CONFIG = "metrics_config"; + private static final String TS_METRICS_MODE = "metrics_mode"; + private static final String TS_DISABLE_SYSTEM_METRICS = "disable_system_metrics"; // IPEX config option that can be set at config.properties private static final String TS_IPEX_ENABLE = "ipex_enable"; @@ -92,12 +97,12 @@ public final class ConfigManager { private static final String TS_PREFER_DIRECT_BUFFER = "prefer_direct_buffer"; private static final String TS_ALLOWED_URLS = "allowed_urls"; private static final String TS_INSTALL_PY_DEP_PER_MODEL = "install_py_dep_per_model"; - private static final String TS_METRICS_FORMAT = "metrics_format"; private static final String TS_ENABLE_METRICS_API = "enable_metrics_api"; private static final String TS_GRPC_INFERENCE_PORT = "grpc_inference_port"; private static final String TS_GRPC_MANAGEMENT_PORT = "grpc_management_port"; private static final String TS_ENABLE_GRPC_SSL = "enable_grpc_ssl"; private static final String TS_INITIAL_WORKER_PORT = "initial_worker_port"; + private static final String TS_INITIAL_DISTRIBUTION_PORT = "initial_distribution_port"; private static final String TS_WORKFLOW_STORE = "workflow_store"; // Configuration which are not documented or enabled through environment variables @@ -133,6 +138,7 @@ public final class ConfigManager { private static ConfigManager instance; private String hostName; private Map> modelConfig = new HashMap<>(); + private String torchrunLogDir; private ConfigManager(Arguments args) throws IOException { prop = new Properties(); @@ -333,10 +339,6 @@ public boolean getInstallPyDepPerModel() { return Boolean.parseBoolean(getProperty(TS_INSTALL_PY_DEP_PER_MODEL, "false")); } - public String getMetricsFormat() { - return getProperty(TS_METRICS_FORMAT, METRIC_FORMAT_PROMETHEUS); - } - public boolean isMetricApiEnable() { return Boolean.parseBoolean(getProperty(TS_ENABLE_METRICS_API, "true")); } @@ -361,10 +363,27 @@ public int getJobQueueSize() { return getIntProperty(TS_JOB_QUEUE_SIZE, 100); } + public float getHighPrioProb() throws IllegalArgumentException { + float highPrioProb = getFloatProperty(TS_HIGH_PRIORITY_PROBABILITY, 0.67f); + if (highPrioProb < 0.00f || highPrioProb > 1.00f){ + throw new IllegalArgumentException("highPrioProb " + String.valueOf(highPrioProb) + + " is not a valid probability!"); + } + return highPrioProb; + } + public int getNumberOfGpu() { return getIntProperty(TS_NUMBER_OF_GPU, 0); } + public int getMinFreeGpuMemory() { + return 
getIntProperty(TS_MIN_FREE_GPU_MEMORY, 4096); + } + + public float getMaxShareGpuFailures() { + return getFloatProperty(TS_MAX_SHARE_GPU_FAILURES, 0.90f); + } + public String getMetricsConfigPath() { String path = getCanonicalPath(prop.getProperty(TS_METRICS_CONFIG)); if (path == null) { @@ -373,6 +392,25 @@ public String getMetricsConfigPath() { return path; } + public String getTorchRunLogDir() { + if (torchrunLogDir == null) { + torchrunLogDir = + Paths.get( + getCanonicalPath(System.getProperty("LOG_LOCATION")), + "torchelastic_ts") + .toString(); + } + return torchrunLogDir; + } + + public String getMetricsMode() { + return getProperty(TS_METRICS_MODE, "log"); + } + + public boolean isSystemMetricsDisabled() { + return Boolean.parseBoolean(getProperty(TS_DISABLE_SYSTEM_METRICS, "false")); + } + public String getTsDefaultServiceHandler() { return getProperty(TS_DEFAULT_SERVICE_HANDLER, null); } @@ -638,10 +676,12 @@ public String dumpConfigurations() { + getAllowedUrls() + "\nCustom python dependency for model allowed: " + prop.getProperty(TS_INSTALL_PY_DEP_PER_MODEL, "false") - + "\nMetrics report format: " - + prop.getProperty(TS_METRICS_FORMAT, METRIC_FORMAT_PROMETHEUS) + "\nEnable metrics API: " + prop.getProperty(TS_ENABLE_METRICS_API, "true") + + "\nMetrics mode: " + + getMetricsMode() + + "\nDisable system metrics: " + + isSystemMetricsDisabled() + "\nWorkflow Store: " + (getWorkflowStore() == null ? "N/A" : getWorkflowStore()) + "\nModel config: " @@ -680,6 +720,14 @@ private int getIntProperty(String key, int def) { return Integer.parseInt(value); } + private float getFloatProperty(String key, float def) { + String value = prop.getProperty(key); + if (value == null) { + return def; + } + return Float.parseFloat(value); + } + public int getDefaultResponseTimeout() { return Integer.parseInt(prop.getProperty(TS_DEFAULT_RESPONSE_TIMEOUT, "120")); } @@ -803,6 +851,14 @@ public void setInitialWorkerPort(int initialPort) { prop.setProperty(TS_INITIAL_WORKER_PORT, String.valueOf(initialPort)); } + public int getInitialDistributionPort() { + return Integer.parseInt(prop.getProperty(TS_INITIAL_DISTRIBUTION_PORT, "29500")); + } + + public void setInitialDistributionPort(int initialPort) { + prop.setProperty(TS_INITIAL_DISTRIBUTION_PORT, String.valueOf(initialPort)); + } + private void setModelConfig() { String modelConfigStr = prop.getProperty(MODEL_CONFIG, null); Type type = new TypeToken<Map<String, Map<String, Object>>>() {}.getType(); diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/GPUManager.java b/frontend/server/src/main/java/org/pytorch/serve/util/GPUManager.java new file mode 100644 index 0000000000..410a34bdcc --- /dev/null +++ b/frontend/server/src/main/java/org/pytorch/serve/util/GPUManager.java @@ -0,0 +1,179 @@ +package org.pytorch.serve.util; + +import java.io.BufferedReader; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.ArrayDeque; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.atomic.AtomicInteger; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public final class GPUManager { + + private static final Logger logger = LoggerFactory.getLogger(GPUManager.class); + private static final int nFailureHistory = 100; + + private static GPUManager instance; + + private final int nGPUs; + private final int minFreeMemory; + private final float maxShareFailures; + +
private AtomicInteger[] freeMemory; + private HashMap<String, Integer> workerIds; + private ArrayDeque<Integer> gpuFailureHistory; + + private GPUManager(int nGPUs, int minFreeMemory, float maxShareFailures) { + this.nGPUs = nGPUs; + this.minFreeMemory = minFreeMemory; + this.maxShareFailures = maxShareFailures; + + this.gpuFailureHistory = new ArrayDeque<>(); + this.workerIds = new HashMap<>(); + + if (nGPUs > 0) { + this.freeMemory = new AtomicInteger[this.nGPUs]; + for (int i = 0; i < this.nGPUs; i++) { + this.freeMemory[i] = new AtomicInteger(-1); + } + } + } + + // code largely copied from WorkerThread::getGpuUsage + private int queryNvidiaSmiFreeMemory(int gpuId) { + Process process; + try { + process = + Runtime.getRuntime() + .exec( + "nvidia-smi -i " + + gpuId + + " --query-gpu=memory.free --format=csv,noheader,nounits"); + process.waitFor(); + int exitCode = process.exitValue(); + if (exitCode != 0) { + InputStream error = process.getErrorStream(); + for (int i = 0; i < error.available(); i++) { + logger.error("" + error.read()); + } + return -1; + } + InputStream stdout = process.getInputStream(); + BufferedReader reader = + new BufferedReader(new InputStreamReader(stdout, StandardCharsets.UTF_8)); + String line = reader.readLine(); + if (line == null) { + return -1; + } else { + return Integer.parseInt(line); + } + } catch (Exception e) { + logger.error("An exception occurred when querying for free gpu memory", e); + } + + return -1; + } + + public static synchronized void init(ConfigManager configManager) { + int nGPUs = configManager.getNumberOfGpu(); + int minFreeMemory = configManager.getMinFreeGpuMemory(); + float maxShareFailures = configManager.getMaxShareGpuFailures(); + instance = new GPUManager(nGPUs, minFreeMemory, maxShareFailures); + } + + public static synchronized GPUManager getInstance() { + return instance; + } + + public synchronized int getGPU(String workerId) { + // return -1 if there are no gpus + if (this.nGPUs == 0) { + return -1; + } + int failedGpuId; + // if the worker was previously assigned to a GPU and now requests a new one, it has likely failed + // add failed gpu id to failure history, removing old entries to make space if necessary + if (this.workerIds.containsKey(workerId)) { + failedGpuId = this.workerIds.get(workerId); + while (this.gpuFailureHistory.size() > nFailureHistory - 1) { + this.gpuFailureHistory.removeFirst(); + } + this.gpuFailureHistory.addLast(failedGpuId); + } + // get free memory per GPU + for (int i = 0; i < this.nGPUs; i++) { + this.freeMemory[i].set(queryNvidiaSmiFreeMemory(i)); + } + // get failures for share calculation + int[] nFailures = new int[this.nGPUs]; + for (Iterator<Integer> iter = this.gpuFailureHistory.iterator(); iter.hasNext();) { + failedGpuId = iter.next(); + nFailures[failedGpuId]++; + } + // get free memory for all eligible GPUs + HashMap<Integer, Integer> eligibleIdFreeMems = new HashMap<>(); + for (int i = 0; i < this.nGPUs; i++) { + // check that free memory is available and exceeds minimum + if (this.freeMemory[i].intValue() > this.minFreeMemory) { + if (this.gpuFailureHistory.size() > 1) { + // check that share of failures is smaller than maximum + float shareFailures = (float) nFailures[i] / (float) this.gpuFailureHistory.size(); + if (shareFailures < this.maxShareFailures) { + eligibleIdFreeMems.put(i, this.freeMemory[i].intValue()); + } else { + logger.warn("GPU ID {} deemed ineligible since its {} recent failures exceed the maximum share {}", i, nFailures[i], this.maxShareFailures); + } + } else { + eligibleIdFreeMems.put(i, this.freeMemory[i].intValue()); +
} + logger.info("eligibleIdFreeMems[{}] {}", i, this.freeMemory[i].intValue()); + + } + } + logger.info("eligibleIdFreeMems.size() {}", eligibleIdFreeMems.size()); + // fork on number of eligible GPUs + int gpuId = -1; + if (eligibleIdFreeMems.size() == 0) { + logger.error("No eligible GPUs available, falling back to CPU"); + return gpuId; + } + if (eligibleIdFreeMems.size() == 1) { + gpuId = eligibleIdFreeMems.keySet().iterator().next(); + } else { + // get sum of eligible id free memory for prob calculation + int eligibleIdFreeMemSum = 0; + for (Map.Entry entry : eligibleIdFreeMems.entrySet()) { + eligibleIdFreeMemSum += entry.getValue(); + } + logger.info("eligibleIdFreeMemSum {}", eligibleIdFreeMemSum); + // store cumulative probabilities in navigable map + float cumProb = 0.0f; + TreeMap cumProbIds = new TreeMap (); + for (Map.Entry entry : eligibleIdFreeMems.entrySet()) { + int i = entry.getKey(); + int freeMem = entry.getValue(); + cumProb += (float) freeMem / (float) eligibleIdFreeMemSum; + cumProbIds.put(cumProb, i); + // TODO Simon: This log should maybe have been removed during rebase + logger.info("cumProbIds[{}] {} because of freeMem {}", cumProb, i, freeMem); + } + // make random selection + float randFloat = ThreadLocalRandom.current().nextFloat(); + logger.info("randFloat {}", randFloat); + gpuId = cumProbIds.ceilingEntry(randFloat).getValue(); + logger.info("gpuId {}", gpuId); + } + logger.info("Assigning gpuId " + gpuId + + " with free memory " + eligibleIdFreeMems.get(gpuId) + + " with number of failures " + nFailures[gpuId] + + " to workerId " + workerId); + return gpuId; + } +} diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/NettyUtils.java b/frontend/server/src/main/java/org/pytorch/serve/util/NettyUtils.java index 31d873d02d..53b3cfcd62 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/util/NettyUtils.java +++ b/frontend/server/src/main/java/org/pytorch/serve/util/NettyUtils.java @@ -12,6 +12,7 @@ import io.netty.handler.codec.http.HttpHeaderValues; import io.netty.handler.codec.http.HttpHeaders; import io.netty.handler.codec.http.HttpRequest; +import io.netty.handler.codec.http.HttpResponse; import io.netty.handler.codec.http.HttpResponseStatus; import io.netty.handler.codec.http.HttpUtil; import io.netty.handler.codec.http.HttpVersion; @@ -23,12 +24,13 @@ import io.netty.util.CharsetUtil; import java.io.IOException; import java.net.SocketAddress; +import java.util.Arrays; import java.util.List; import org.pytorch.serve.http.ErrorResponse; import org.pytorch.serve.http.Session; import org.pytorch.serve.http.StatusResponse; -import org.pytorch.serve.metrics.Dimension; -import org.pytorch.serve.metrics.Metric; +import org.pytorch.serve.metrics.IMetric; +import org.pytorch.serve.metrics.MetricCache; import org.pytorch.serve.util.messages.InputParameter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,31 +42,6 @@ public final class NettyUtils { private static final String REQUEST_ID = "x-request-id"; private static final AttributeKey SESSION_KEY = AttributeKey.valueOf("session"); - private static final Dimension DIMENSION = new Dimension("Level", "Host"); - private static final Metric REQUESTS_2_XX = - new Metric( - "Requests2XX", - "1", - "Count", - ConfigManager.getInstance().getHostName(), - DIMENSION); - private static final Metric REQUESTS_4_XX = - new Metric( - "Requests4XX", - "1", - "Count", - ConfigManager.getInstance().getHostName(), - DIMENSION); - private static final Metric REQUESTS_5_XX = - new Metric( - 
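Selection among the eligible devices is weighted random sampling: each GPU's probability is proportional to its free memory, the running totals go into a navigable map, and ceilingEntry on a uniform draw picks the device, exactly as in getGPU above. A self-contained sketch of the same scheme (the null guard against float rounding is our addition, not part of the patch):

import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.ThreadLocalRandom;

public final class WeightedGpuPick {
    private WeightedGpuPick() {}

    // freeMemByGpu: eligible GPU id -> free memory in MiB
    public static int pick(Map<Integer, Integer> freeMemByGpu) {
        int total = freeMemByGpu.values().stream().mapToInt(Integer::intValue).sum();
        TreeMap<Float, Integer> cumProbIds = new TreeMap<>();
        float cumProb = 0.0f;
        for (Map.Entry<Integer, Integer> e : freeMemByGpu.entrySet()) {
            cumProb += (float) e.getValue() / (float) total;
            cumProbIds.put(cumProb, e.getKey());
        }
        float r = ThreadLocalRandom.current().nextFloat();
        // rounding can leave the final cumulative probability a hair below 1.0
        Map.Entry<Float, Integer> hit = cumProbIds.ceilingEntry(r);
        return hit != null ? hit.getValue() : cumProbIds.lastEntry().getValue();
    }
}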
"Requests5XX", - "1", - "Count", - ConfigManager.getInstance().getHostName(), - DIMENSION); - - private static final Logger loggerTsMetrics = - LoggerFactory.getLogger(ConfigManager.MODEL_SERVER_METRICS_LOGGER); private NettyUtils() {} @@ -142,7 +119,7 @@ public static void sendError( * @param keepAlive if keep the connection */ public static void sendHttpResponse( - ChannelHandlerContext ctx, FullHttpResponse resp, boolean keepAlive) { + ChannelHandlerContext ctx, HttpResponse resp, boolean keepAlive) { // Send the response and close the connection if necessary. Channel channel = ctx.channel(); Session session = channel.attr(SESSION_KEY).getAndSet(null); @@ -156,12 +133,35 @@ public static void sendHttpResponse( logger.info(session.toString()); } int code = resp.status().code(); + List requestsMetricDimensionValues = + Arrays.asList("Host", ConfigManager.getInstance().getHostName()); if (code >= 200 && code < 300) { - loggerTsMetrics.info("{}", REQUESTS_2_XX); + IMetric requests2xxMetric = MetricCache.getInstance().getMetricFrontend("Requests2XX"); + if (requests2xxMetric != null) { + try { + requests2xxMetric.addOrUpdate(requestsMetricDimensionValues, 1); + } catch (Exception e) { + logger.error("Failed to update frontend metric Requests2XX: ", e); + } + } } else if (code >= 400 && code < 500) { - loggerTsMetrics.info("{}", REQUESTS_4_XX); + IMetric requests4xxMetric = MetricCache.getInstance().getMetricFrontend("Requests4XX"); + if (requests4xxMetric != null) { + try { + requests4xxMetric.addOrUpdate(requestsMetricDimensionValues, 1); + } catch (Exception e) { + logger.error("Failed to update frontend metric Requests4XX: ", e); + } + } } else { - loggerTsMetrics.info("{}", REQUESTS_5_XX); + IMetric requests5xxMetric = MetricCache.getInstance().getMetricFrontend("Requests5XX"); + if (requests5xxMetric != null) { + try { + requests5xxMetric.addOrUpdate(requestsMetricDimensionValues, 1); + } catch (Exception e) { + logger.error("Failed to update frontend metric Requests5XX: ", e); + } + } } String allowedOrigin = configManager.getCorsAllowedOrigin(); @@ -189,7 +189,11 @@ public static void sendHttpResponse( headers.set("Cache-Control", "no-cache; no-store, must-revalidate, private"); headers.set("Expires", "Thu, 01 Jan 1970 00:00:00 UTC"); - HttpUtil.setContentLength(resp, resp.content().readableBytes()); + if (resp instanceof FullHttpResponse) { + HttpUtil.setContentLength(resp, ((FullHttpResponse) resp).content().readableBytes()); + } else { + HttpUtil.setTransferEncodingChunked(resp, true); + } if (!keepAlive || code >= 400) { headers.set(HttpHeaderNames.CONNECTION, HttpHeaderValues.CLOSE); ChannelFuture f = channel.writeAndFlush(resp); diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/Prioritisable.java b/frontend/server/src/main/java/org/pytorch/serve/util/Prioritisable.java new file mode 100644 index 0000000000..752d64f9d6 --- /dev/null +++ b/frontend/server/src/main/java/org/pytorch/serve/util/Prioritisable.java @@ -0,0 +1,8 @@ +package org.pytorch.serve.util; + +public interface Prioritisable { + + public Priority getPriority(); + public void setPriority(Priority priority); + +} diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/Priority.java b/frontend/server/src/main/java/org/pytorch/serve/util/Priority.java new file mode 100644 index 0000000000..ce065431a8 --- /dev/null +++ b/frontend/server/src/main/java/org/pytorch/serve/util/Priority.java @@ -0,0 +1,5 @@ +package org.pytorch.serve.util; + +public enum Priority { + LOW, HIGH, MAX +} diff --git 
a/frontend/server/src/main/java/org/pytorch/serve/util/PriorityLinkedBlockingDeque.java b/frontend/server/src/main/java/org/pytorch/serve/util/PriorityLinkedBlockingDeque.java new file mode 100644 index 0000000000..cdf58bc1b3 --- /dev/null +++ b/frontend/server/src/main/java/org/pytorch/serve/util/PriorityLinkedBlockingDeque.java @@ -0,0 +1,145 @@ +package org.pytorch.serve.util; + +import java.util.concurrent.locks.ReentrantLock; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.Enumeration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class PriorityLinkedBlockingDeque<T extends Prioritisable> { + + private static final Logger logger = LoggerFactory.getLogger(PriorityLinkedBlockingDeque.class); + + // lock and condition for waiting on empty queues + final ReentrantLock lock = new ReentrantLock(); + private final Condition notEmpty = lock.newCondition(); + + private final int queueSize; + private final float highPrioProb; + private final ConcurrentHashMap<Priority, LinkedBlockingDeque<T>> priorityDeques; + + public PriorityLinkedBlockingDeque(int queueSize, float highPrioProb) { + + this.queueSize = queueSize; + this.highPrioProb = highPrioProb; + this.priorityDeques = new ConcurrentHashMap<Priority, LinkedBlockingDeque<T>>(); + + // initialize priority deques + for (Priority priority : Priority.values()) { + this.priorityDeques.put(priority, new LinkedBlockingDeque<T>(queueSize)); + } + } + + private LinkedBlockingDeque<T> getDequeForExtraction() { + + // always select the MAX deque first if it is non-empty + if (!this.priorityDeques.get(Priority.MAX).isEmpty()) { + return this.priorityDeques.get(Priority.MAX); + } + + boolean highNonEmpty = !this.priorityDeques.get(Priority.HIGH).isEmpty(); + + // if both high and low are non-empty, make a random selection + if (highNonEmpty && !this.priorityDeques.get(Priority.LOW).isEmpty()) { + if (ThreadLocalRandom.current().nextFloat() < this.highPrioProb) { + return this.priorityDeques.get(Priority.HIGH); + } else { + return this.priorityDeques.get(Priority.LOW); + } + // if only high is non-empty, return high + } else if (highNonEmpty) { + return this.priorityDeques.get(Priority.HIGH); + } + + // if both are empty, or only low is non-empty, return low + return this.priorityDeques.get(Priority.LOW); + + } + + private LinkedBlockingDeque<T> getDequeForInsertion(T p) { + Priority priority = p.getPriority(); + LinkedBlockingDeque<T> dequeForInsertion = this.priorityDeques.get(priority); + return dequeForInsertion; + } + + /* + ideally, we would want to forward this to getDequeForExtraction().unlinkFirst(), but it is private + pollFirst() is a public method that forwards to unlinkFirst(), so it's the next best alternative + reference: https://github.com/openjdk/jdk17/blob/master/src/java.base/share/classes/java/util/concurrent/LinkedBlockingDeque.java + */ + private T unlinkFirst() { + return getDequeForExtraction().pollFirst(); + } + + public boolean isEmpty() { + // return true iff all deques are empty + return this.priorityDeques.reduceValues(Long.MAX_VALUE, LinkedBlockingDeque::isEmpty, Boolean::logicalAnd); + } + + public boolean offer(T p) { + final ReentrantLock lock = this.lock; + lock.lock(); + try { + boolean itemInserted = getDequeForInsertion(p).offer(p); + if (itemInserted) { + // awaken one worker that is waiting for notEmpty condition + notEmpty.signal(); + } + return 
itemInserted; + } finally { + lock.unlock(); + } + } + + public void addFirst(T p) { + final ReentrantLock lock = this.lock; + lock.lock(); + try { + getDequeForInsertion(p).addFirst(p); + // awaken one worker that is waiting for notEmpty condition + notEmpty.signal(); + } finally { + lock.unlock(); + } + } + + /* + this is exactly the same as the equivalent method in LinkedBlockingDeque, the difference is in the implementation of unlinkFirst() + reference: https://github.com/openjdk/jdk17/blob/master/src/java.base/share/classes/java/util/concurrent/LinkedBlockingDeque.java + */ + public T poll(long timeout, TimeUnit unit) throws InterruptedException { + long nanos = unit.toNanos(timeout); + final ReentrantLock lock = this.lock; + lock.lockInterruptibly(); + try { + T x; + while ( (x = unlinkFirst()) == null) { + if (nanos <= 0L) { + return null; + } + // waits until notEmpty condition is signalled + nanos = notEmpty.awaitNanos(nanos); + } + return x; + } finally { + lock.unlock(); + } + } + + public String getQueueStatusString() { + String response = ""; + for (Priority priority : Priority.values()) { + int currentQueueStatus = this.priorityDeques.get(priority).size(); + response = response + priority.toString() + "=" + String.valueOf(currentQueueStatus) + ","; + } + response = response + "queueSize=" + String.valueOf(this.queueSize); + return response; + + } +} diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/messages/BaseModelRequest.java b/frontend/server/src/main/java/org/pytorch/serve/util/messages/BaseModelRequest.java index a567d10818..3d3e57095c 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/util/messages/BaseModelRequest.java +++ b/frontend/server/src/main/java/org/pytorch/serve/util/messages/BaseModelRequest.java @@ -16,6 +16,10 @@ public WorkerCommands getCommand() { return command; } + public void setCommand(WorkerCommands workerCommands) { + this.command = workerCommands; + } + public String getModelName() { return modelName; } diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/messages/RequestInput.java b/frontend/server/src/main/java/org/pytorch/serve/util/messages/RequestInput.java index 8c4d34904f..af5dc0f54a 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/util/messages/RequestInput.java +++ b/frontend/server/src/main/java/org/pytorch/serve/util/messages/RequestInput.java @@ -7,15 +7,18 @@ import java.util.Map; public class RequestInput { + public static final String TS_STREAM_NEXT = "ts_stream_next"; private String requestId; private Map headers; private List parameters; + private long clientExpireTS; public RequestInput(String requestId) { this.requestId = requestId; headers = new HashMap<>(); parameters = new ArrayList<>(); + clientExpireTS = Long.MAX_VALUE; // default(never expire): Long.MAX_VALUE } public String getRequestId() { @@ -58,4 +61,14 @@ public String getStringParameter(String key) { } return null; } + + public long getClientExpireTS() { + return clientExpireTS; + } + + public void setClientExpireTS(long clientTimeoutInMills) { + if (clientTimeoutInMills > 0) { + this.clientExpireTS = System.currentTimeMillis() + clientTimeoutInMills; + } + } } diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/messages/WorkerCommands.java b/frontend/server/src/main/java/org/pytorch/serve/util/messages/WorkerCommands.java index ceffd366c7..69ba0c80fe 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/util/messages/WorkerCommands.java +++ 
b/frontend/server/src/main/java/org/pytorch/serve/util/messages/WorkerCommands.java @@ -12,7 +12,9 @@ public enum WorkerCommands { @SerializedName("stats") STATS("stats"), @SerializedName("describe") - DESCRIBE("describe"); + DESCRIBE("describe"), + @SerializedName("streampredict") + STREAMPREDICT("streampredict"); private String command; diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/BatchAggregator.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/BatchAggregator.java index 9aa246bc8a..0d8d050462 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/wlm/BatchAggregator.java +++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/BatchAggregator.java @@ -9,6 +9,7 @@ import org.pytorch.serve.util.messages.ModelWorkerResponse; import org.pytorch.serve.util.messages.Predictions; import org.pytorch.serve.util.messages.RequestInput; +import org.pytorch.serve.util.messages.WorkerCommands; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,6 +34,11 @@ public BaseModelRequest getRequest(String threadName, WorkerState state) model.pollBatch( threadName, (state == WorkerState.WORKER_MODEL_LOADED) ? 0 : Long.MAX_VALUE, jobs); + if (model.isUseJobTicket() && jobs.isEmpty()) { + model.decNumJobTickets(); + return req; + } + for (Job j : jobs.values()) { if (j.isControlCmd()) { if (jobs.size() > 1) { @@ -48,6 +54,9 @@ public BaseModelRequest getRequest(String threadName, WorkerState state) } return new ModelLoadModelRequest(model, gpuId); } else { + if (j.getCmd() == WorkerCommands.STREAMPREDICT) { + req.setCommand(WorkerCommands.STREAMPREDICT); + } j.setScheduled(); req.addRequest(j.getPayload()); } @@ -55,13 +64,18 @@ } return req; } - public void sendResponse(ModelWorkerResponse message) { + /** + * @param message: a response for a batch of inference requests + * @return - true: either a non-stream response or the last stream response is sent - false: a + * stream response (not including the last stream) is sent + */ + public boolean sendResponse(ModelWorkerResponse message) { + boolean jobDone = true; // TODO: Handle prediction level code - if (message.getCode() == 200) { if (jobs.isEmpty()) { // this is from initial load. 
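The job-ticket hunks in this file and in Model.java further below form a simple admission-control loop: Model.addJob only enqueues a request if a ticket can be taken, a worker deposits a ticket when it starts polling for a batch, and getRequest above hands one back when the poll produced an empty batch. A condensed sketch of the counter, reusing the patch's method names:

import java.util.concurrent.atomic.AtomicInteger;

public class JobTicketsSketch {
    private final AtomicInteger numJobTickets = new AtomicInteger(0);

    // a worker thread deposits a ticket each time it starts polling for a batch
    public int incNumJobTickets() {
        return numJobTickets.incrementAndGet();
    }

    // the aggregator returns a ticket when it polled an empty batch
    public int decNumJobTickets() {
        return numJobTickets.decrementAndGet();
    }

    // the frontend admits a new job only while a ticket is available
    public synchronized boolean getJobTickets() {
        if (numJobTickets.get() == 0) {
            return false;
        }
        numJobTickets.decrementAndGet();
        return true;
    }
}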
- return; + return true; } for (Predictions prediction : message.getPredictions()) { String jobId = prediction.getRequestId(); @@ -71,26 +85,52 @@ public void sendResponse(ModelWorkerResponse message) { throw new IllegalStateException( "Unexpected job in sendResponse() with 200 status code: " + jobId); } - job.response( - prediction.getResp(), - prediction.getContentType(), - prediction.getStatusCode(), - prediction.getReasonPhrase(), - prediction.getHeaders()); + if (jobDone) { + String streamNext = + prediction + .getHeaders() + .get( + org.pytorch.serve.util.messages.RequestInput + .TS_STREAM_NEXT); + if (streamNext != null && streamNext.equals("true")) { + jobDone = false; + } + } + if (job.getPayload().getClientExpireTS() > System.currentTimeMillis()) { + job.response( + prediction.getResp(), + prediction.getContentType(), + prediction.getStatusCode(), + prediction.getReasonPhrase(), + prediction.getHeaders()); + } else { + logger.warn( + "Drop response for inference request {} due to client timeout", + job.getPayload().getRequestId()); + } } } else { for (Map.Entry j : jobs.entrySet()) { - if (j.getValue() == null) { throw new IllegalStateException( "Unexpected job in sendResponse() with non 200 status code: " + j.getKey()); } - j.getValue().sendError(message.getCode(), message.getMessage()); + Job job = j.getValue(); + if (job.getPayload().getClientExpireTS() > System.currentTimeMillis()) { + job.sendError(message.getCode(), message.getMessage()); + } else { + logger.warn( + "Drop error response for inference request {} due to client timeout", + job.getPayload().getRequestId()); + } } } - jobs.clear(); + if (jobDone) { + jobs.clear(); + } + return jobDone; } public void sendError(BaseModelRequest message, String error, int status) { diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/Model.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/Model.java index 370c3a40cf..5ab59819bf 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/wlm/Model.java +++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/Model.java @@ -2,18 +2,21 @@ import com.google.gson.JsonObject; import java.io.File; +import java.util.Collections; +import java.util.List; import java.util.Map; import java.util.Objects; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; -import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.locks.ReentrantLock; import org.apache.commons.io.FilenameUtils; import org.pytorch.serve.archive.model.ModelArchive; +import org.pytorch.serve.archive.model.ModelConfig; import org.pytorch.serve.job.Job; import org.pytorch.serve.util.ConfigManager; +import org.pytorch.serve.util.PriorityLinkedBlockingDeque; import org.pytorch.serve.util.messages.WorkerCommands; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -21,12 +24,12 @@ public class Model { public static final String DEFAULT_DATA_QUEUE = "DATA_QUEUE"; - public static final String MIN_WORKERS = "minWorkers"; public static final String MAX_WORKERS = "maxWorkers"; public static final String BATCH_SIZE = "batchSize"; public static final String MAX_BATCH_DELAY = "maxBatchDelay"; public static final String RESPONSE_TIMEOUT = "responseTimeout"; + public static final String PARALLEL_LEVEL = "parallelLevel"; public static final String DEFAULT_VERSION = "defaultVersion"; public static final String MAR_NAME = "marName"; @@ -37,26 +40,95 @@ public class Model { 
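Two per-response checks drive the sendResponse logic above: the batch stays open while the worker marks chunks with the ts_stream_next header, and a response (or error) is only delivered while the client's deadline, clientExpireTS from RequestInput, still lies in the future. Distilled into helpers of our own naming:

import java.util.Map;

public final class ResponseChecks {
    private ResponseChecks() {}

    // intermediate stream chunks carry ts_stream_next=true; the final chunk omits it
    public static boolean isLastChunk(Map<String, String> headers) {
        return !"true".equals(headers.get("ts_stream_next"));
    }

    // jobs whose client deadline has passed are dropped with a warning instead
    public static boolean clientStillWaiting(long clientExpireTS) {
        return clientExpireTS > System.currentTimeMillis();
    }
}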
private int maxWorkers; private int batchSize; private int maxBatchDelay; + private int parallelLevel = 1; + private long maxRetryTimeoutInMill = 5 * 60 * 1000; + private long clientTimeoutInMills; + private ModelConfig.ParallelType parallelType = ModelConfig.ParallelType.NONE; + private ModelConfig.DeviceType deviceType = + ConfigManager.getInstance().getNumberOfGpu() > 0 + ? ModelConfig.DeviceType.GPU + : ModelConfig.DeviceType.CPU; + private List deviceIds; + private int numCores; private ReentrantLock lock; private int responseTimeout; + private int queueSize; + private float highPrioProb; private ModelVersionName modelVersionName; - + private AtomicInteger gpuCounter = new AtomicInteger(0); + private boolean hasCfgDeviceIds; private boolean isWorkflowModel; // Total number of subsequent inference request failures private AtomicInteger failedInfReqs; // Per worker thread job queue. This separates out the control queue from data queue - private ConcurrentMap> jobsDb; + private ConcurrentMap> jobsDb; - public Model(ModelArchive modelArchive, int queueSize) { + private boolean useJobTicket; + private AtomicInteger numJobTickets; + + public Model(ModelArchive modelArchive, int queueSize, float highPrioProb) { this.modelArchive = modelArchive; - batchSize = 1; - maxBatchDelay = 100; + if (modelArchive != null && modelArchive.getModelConfig() != null) { + if (modelArchive.getModelConfig().getParallelLevel() > 1 + && modelArchive.getModelConfig().getParallelType() + != ModelConfig.ParallelType.NONE) { + parallelLevel = modelArchive.getModelConfig().getParallelLevel(); + parallelType = modelArchive.getModelConfig().getParallelType(); + } + if (modelArchive.getModelConfig().getDeviceType() != ModelConfig.DeviceType.NONE) { + deviceType = + (modelArchive.getModelConfig().getDeviceType() == ModelConfig.DeviceType.GPU + && ConfigManager.getInstance().getNumberOfGpu() > 0) + ? ModelConfig.DeviceType.GPU + : ModelConfig.DeviceType.CPU; + } + + deviceIds = modelArchive.getModelConfig().getDeviceIds(); + if (deviceIds != null && deviceIds.size() > 0) { + hasCfgDeviceIds = true; + for (Integer deviceId : deviceIds) { + if (deviceId < 0 || deviceId >= ConfigManager.getInstance().getNumberOfGpu()) { + logger.warn("Invalid deviceId:{}, ignore deviceIds list", deviceId); + deviceIds = null; + hasCfgDeviceIds = false; + break; + } + } + } + maxRetryTimeoutInMill = modelArchive.getModelConfig().getMaxRetryTimeoutInSec() * 1000; + clientTimeoutInMills = modelArchive.getModelConfig().getClientTimeoutInMills(); + if (modelArchive.getModelConfig().getJobQueueSize() > 0) { + // overwrite the queueSize defined on config.property + queueSize = modelArchive.getModelConfig().getJobQueueSize(); + } + useJobTicket = modelArchive.getModelConfig().isUseJobTicket(); + } else { + batchSize = 1; + maxBatchDelay = 100; + } + + if (ConfigManager.getInstance().getNumberOfGpu() > 0 + && deviceType != ModelConfig.DeviceType.CPU) { + numCores = + hasCfgDeviceIds + ? 
deviceIds.size() + : ConfigManager.getInstance().getNumberOfGpu(); + } + + this.queueSize = queueSize; + this.highPrioProb = highPrioProb; + // TODO Simon: These two are commented out for now, as we set them further above + // verify that this is correct + // batchSize = 1; + // maxBatchDelay = 100; + jobsDb = new ConcurrentHashMap<>(); // Always have a queue for data - jobsDb.putIfAbsent(DEFAULT_DATA_QUEUE, new LinkedBlockingDeque<>(queueSize)); + jobsDb.putIfAbsent(DEFAULT_DATA_QUEUE, new PriorityLinkedBlockingDeque<>(this.queueSize, this.highPrioProb)); failedInfReqs = new AtomicInteger(0); + numJobTickets = new AtomicInteger(0); lock = new ReentrantLock(); modelVersionName = new ModelVersionName( @@ -73,6 +145,9 @@ public JsonObject getModelState(boolean isDefaultVersion) { modelInfo.addProperty(BATCH_SIZE, getBatchSize()); modelInfo.addProperty(MAX_BATCH_DELAY, getMaxBatchDelay()); modelInfo.addProperty(RESPONSE_TIMEOUT, getResponseTimeout()); + if (parallelLevel > 1) { + modelInfo.addProperty(PARALLEL_LEVEL, parallelLevel); + } return modelInfo; } @@ -83,6 +158,9 @@ public void setModelState(JsonObject modelInfo) { maxBatchDelay = modelInfo.get(MAX_BATCH_DELAY).getAsInt(); responseTimeout = modelInfo.get(RESPONSE_TIMEOUT).getAsInt(); batchSize = modelInfo.get(BATCH_SIZE).getAsInt(); + if (modelInfo.get(PARALLEL_LEVEL) != null) { + parallelLevel = modelInfo.get(PARALLEL_LEVEL).getAsInt(); + } } public String getModelName() { @@ -150,9 +228,9 @@ public void setWorkflowModel(boolean workflowModel) { } public void addJob(String threadId, Job job) { - LinkedBlockingDeque<Job> blockingDeque = jobsDb.get(threadId); + PriorityLinkedBlockingDeque<Job> blockingDeque = jobsDb.get(threadId); if (blockingDeque == null) { - blockingDeque = new LinkedBlockingDeque<>(); + blockingDeque = new PriorityLinkedBlockingDeque<>(this.queueSize, this.highPrioProb); jobsDb.put(threadId, blockingDeque); } blockingDeque.offer(job); @@ -165,6 +243,10 @@ public void removeJobQueue(String threadId) { } public boolean addJob(Job job) { + if (isUseJobTicket() && !getJobTickets()) { + logger.info("There are no job tickets"); + return false; + } return jobsDb.get(DEFAULT_DATA_QUEUE).offer(job); } @@ -183,7 +265,7 @@ public void pollBatch(String threadId, long waitTime, Map<String, Job> jobsRepo) "The jobs repo provided contains stale jobs. 
Clear them!!"); } - LinkedBlockingDeque jobsQueue = jobsDb.get(threadId); + PriorityLinkedBlockingDeque jobsQueue = jobsDb.get(threadId); if (jobsQueue != null && !jobsQueue.isEmpty()) { Job j = jobsQueue.poll(waitTime, TimeUnit.MILLISECONDS); if (j != null) { @@ -193,6 +275,9 @@ public void pollBatch(String threadId, long waitTime, Map jobsRepo) } try { + if (isUseJobTicket()) { + incNumJobTickets(); + } lock.lockInterruptibly(); long maxDelay = maxBatchDelay; jobsQueue = jobsDb.get(DEFAULT_DATA_QUEUE); @@ -201,8 +286,9 @@ public void pollBatch(String threadId, long waitTime, Map jobsRepo) logger.trace("get first job: {}", Objects.requireNonNull(j).getJobId()); jobsRepo.put(j.getJobId(), j); - // describe request job batch size always is 1 - if (j.getCmd() == WorkerCommands.DESCRIBE) { + // batch size always is 1 for describe request job and stream prediction request job + if (j.getCmd() == WorkerCommands.DESCRIBE + || j.getCmd() == WorkerCommands.STREAMPREDICT) { return; } long begin = System.currentTimeMillis(); @@ -212,15 +298,23 @@ public void pollBatch(String threadId, long waitTime, Map jobsRepo) break; } long end = System.currentTimeMillis(); - // describe request job batch size always is 1 - if (j.getCmd() == WorkerCommands.DESCRIBE) { + // job batch size always is 1 when request is + // describe or stream prediction + if (j.getCmd() == WorkerCommands.DESCRIBE + || j.getCmd() == WorkerCommands.STREAMPREDICT) { // Add the job back into the jobsQueue jobsQueue.addFirst(j); break; } maxDelay -= end - begin; begin = end; - jobsRepo.put(j.getJobId(), j); + if (j.getPayload().getClientExpireTS() > System.currentTimeMillis()) { + jobsRepo.put(j.getJobId(), j); + } else { + logger.warn( + "Drop inference request {} due to client timeout", + j.getPayload().getRequestId()); + } if (maxDelay <= 0) { break; } @@ -248,4 +342,77 @@ public int getResponseTimeout() { public void setResponseTimeout(int responseTimeout) { this.responseTimeout = responseTimeout; } + + public List getDeviceIds() { + return this.deviceIds; + } + + public void setDeviceIds(List deviceIds) { + Collections.copy(this.deviceIds, deviceIds); + } + + public int getParallelLevel() { + return this.parallelLevel; + } + + public ModelConfig.ParallelType getParallelType() { + return this.parallelType; + } + + public ModelConfig.DeviceType getDeviceType() { + return this.deviceType; + } + + public int getNumCores() { + return this.numCores; + } + + public AtomicInteger getGpuCounter() { + return gpuCounter; + } + + public boolean isHasCfgDeviceIds() { + return hasCfgDeviceIds; + } + + public long getMaxRetryTimeoutInMill() { + return maxRetryTimeoutInMill; + } + + public void setMaxRetryTimeoutInMill(long maxRetryTimeoutInMill) { + this.maxRetryTimeoutInMill = maxRetryTimeoutInMill; + } + + public long getClientTimeoutInMills() { + return clientTimeoutInMills; + } + + public void setClientTimeoutInMills(long clientTimeoutInMills) { + this.clientTimeoutInMills = clientTimeoutInMills; + } + + public String getQueueStatusString() { + return jobsDb.get(DEFAULT_DATA_QUEUE).getQueueStatusString(); + } + + public boolean isUseJobTicket() { + return useJobTicket; + } + + public int incNumJobTickets() { + return this.numJobTickets.incrementAndGet(); + } + + public int decNumJobTickets() { + return this.numJobTickets.decrementAndGet(); + } + + public synchronized boolean getJobTickets() { + if (this.numJobTickets.get() == 0) { + return false; + } + + this.numJobTickets.decrementAndGet(); + return true; + } } diff --git 
a/frontend/server/src/main/java/org/pytorch/serve/wlm/ModelManager.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/ModelManager.java index 7245241cab..4c3a8282a7 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/wlm/ModelManager.java +++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/ModelManager.java @@ -21,6 +21,7 @@ import org.pytorch.serve.archive.DownloadArchiveException; import org.pytorch.serve.archive.model.Manifest; import org.pytorch.serve.archive.model.ModelArchive; +import org.pytorch.serve.archive.model.ModelConfig; import org.pytorch.serve.archive.model.ModelException; import org.pytorch.serve.archive.model.ModelNotFoundException; import org.pytorch.serve.archive.model.ModelVersionNotFoundException; @@ -52,10 +53,6 @@ private ModelManager(ConfigManager configManager, WorkLoadManager wlm) { this.startupModels = new HashSet<>(); } - public ScheduledExecutorService getScheduler() { - return scheduler; - } - public static void init(ConfigManager configManager, WorkLoadManager wlm) { modelManager = new ModelManager(configManager, wlm); } @@ -64,6 +61,10 @@ public static ModelManager getInstance() { return modelManager; } + public ScheduledExecutorService getScheduler() { + return scheduler; + } + public ModelArchive registerModel(String url, String defaultModelName) throws ModelException, IOException, InterruptedException, DownloadArchiveException { return registerModel( @@ -81,7 +82,8 @@ public ModelArchive registerModel(String url, String defaultModelName) } public void registerAndUpdateModel(String modelName, JsonObject modelInfo) - throws ModelException, IOException, InterruptedException, DownloadArchiveException { + throws ModelException, IOException, InterruptedException, DownloadArchiveException, + WorkerInitializationException { boolean defaultVersion = modelInfo.get(Model.DEFAULT_VERSION).getAsBoolean(); String url = modelInfo.get(Model.MAR_NAME).getAsString(); @@ -264,33 +266,74 @@ private Model createModel( int maxBatchDelay, int responseTimeout, boolean isWorkflowModel) { - Model model = new Model(archive, configManager.getJobQueueSize()); - - model.setBatchSize( - configManager.getJsonIntValue( - archive.getModelName(), - archive.getModelVersion(), - Model.BATCH_SIZE, - batchSize)); - model.setMaxBatchDelay( - configManager.getJsonIntValue( - archive.getModelName(), - archive.getModelVersion(), - Model.MAX_BATCH_DELAY, - maxBatchDelay)); - model.setResponseTimeout( - configManager.getJsonIntValue( - archive.getModelName(), - archive.getModelVersion(), - Model.RESPONSE_TIMEOUT, - responseTimeout)); + Model model = new Model(archive, configManager.getJobQueueSize(), configManager.getHighPrioProb()); + + if (archive.getModelConfig() != null) { + int marBatchSize = archive.getModelConfig().getBatchSize(); + batchSize = + marBatchSize > 0 + ? marBatchSize + : configManager.getJsonIntValue( + archive.getModelName(), + archive.getModelVersion(), + Model.BATCH_SIZE, + batchSize); + } else { + batchSize = + configManager.getJsonIntValue( + archive.getModelName(), + archive.getModelVersion(), + Model.BATCH_SIZE, + batchSize); + } + model.setBatchSize(batchSize); + + if (archive.getModelConfig() != null) { + int marMaxBatchDelay = archive.getModelConfig().getMaxBatchDelay(); + maxBatchDelay = + marMaxBatchDelay > 0 + ? 
marMaxBatchDelay + : configManager.getJsonIntValue( + archive.getModelName(), + archive.getModelVersion(), + Model.MAX_BATCH_DELAY, + maxBatchDelay); + } else { + maxBatchDelay = + configManager.getJsonIntValue( + archive.getModelName(), + archive.getModelVersion(), + Model.MAX_BATCH_DELAY, + maxBatchDelay); + } + model.setMaxBatchDelay(maxBatchDelay); + + if (archive.getModelConfig() != null) { + int marResponseTimeout = archive.getModelConfig().getResponseTimeout(); + responseTimeout = + marResponseTimeout > 0 + ? marResponseTimeout + : configManager.getJsonIntValue( + archive.getModelName(), + archive.getModelVersion(), + Model.RESPONSE_TIMEOUT, + responseTimeout); + } else { + responseTimeout = + configManager.getJsonIntValue( + archive.getModelName(), + archive.getModelVersion(), + Model.RESPONSE_TIMEOUT, + responseTimeout); + } + model.setResponseTimeout(responseTimeout); model.setWorkflowModel(isWorkflowModel); return model; } private Model createModel(ModelArchive archive, JsonObject modelInfo) { - Model model = new Model(archive, configManager.getJobQueueSize()); + Model model = new Model(archive, configManager.getJobQueueSize(), configManager.getHighPrioProb()); model.setModelState(modelInfo); model.setWorkflowModel(false); @@ -379,7 +422,7 @@ public void setDefaultVersion(String modelName, String newModelVersion) private CompletableFuture updateModel( String modelName, String versionId, boolean isStartup) - throws ModelVersionNotFoundException { + throws ModelVersionNotFoundException, WorkerInitializationException { Model model = getVersionModel(modelName, versionId); return updateModel( modelName, @@ -397,14 +440,39 @@ public CompletableFuture updateModel( int maxWorkers, boolean isStartup, boolean isCleanUp) - throws ModelVersionNotFoundException { + throws ModelVersionNotFoundException, WorkerInitializationException { Model model = getVersionModel(modelName, versionId); if (model == null) { throw new ModelVersionNotFoundException( "Model version: " + versionId + " does not exist for model: " + modelName); } - + if (model.getParallelLevel() > 1 && model.getDeviceType() == ModelConfig.DeviceType.GPU) { + /** + * Current capacity check for LMI is based on a single node. TODO: multiple nodes check + * will be based on --proc-per-node + numCores. + */ + int capacity = model.getNumCores() / model.getParallelLevel(); + if (capacity == 0) { + logger.error( + "there are not enough gpu devices to support this parallelLevel: {}", + model.getParallelLevel()); + throw new WorkerInitializationException( + "Not enough gpu devices for model:" + + modelName + + " parallelLevel:" + + model.getParallelLevel()); + } else { + minWorkers = minWorkers > capacity ? capacity : minWorkers; + maxWorkers = maxWorkers > capacity ? 
capacity : maxWorkers; + logger.info( + "model {} set minWorkers: {}, maxWorkers: {} for parallelLevel: {} ", + modelName, + minWorkers, + maxWorkers, + model.getParallelLevel()); + } + } model.setMinWorkers(minWorkers); model.setMaxWorkers(maxWorkers); logger.debug("updateModel: {}, count: {}", modelName, minWorkers); @@ -423,7 +491,7 @@ private Model getVersionModel(String modelName, String versionId) { public CompletableFuture updateModel( String modelName, String versionId, int minWorkers, int maxWorkers) - throws ModelVersionNotFoundException { + throws ModelVersionNotFoundException, WorkerInitializationException { return updateModel(modelName, versionId, minWorkers, maxWorkers, false, false); } @@ -519,4 +587,8 @@ public Set> getAllModels() { public int getNumRunningWorkers(ModelVersionName modelVersionName) { return wlm.getNumRunningWorkers(modelVersionName); } + + public int getNumHealthyWorkers(ModelVersionName modelVersionName) { + return wlm.getNumHealthyWorkers(modelVersionName); + } } diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkLoadManager.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkLoadManager.java index c8f8b1d6a6..0112c1d4dd 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkLoadManager.java +++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkLoadManager.java @@ -16,28 +16,31 @@ import java.util.concurrent.atomic.AtomicInteger; import org.pytorch.serve.snapshot.SnapshotManager; import org.pytorch.serve.util.ConfigManager; +import org.pytorch.serve.util.GPUManager; import org.pytorch.serve.util.OSUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class WorkLoadManager { + private static final Logger logger = LoggerFactory.getLogger(WorkLoadManager.class); private ExecutorService threadPool; - private ConcurrentHashMap> workers; - private ConfigManager configManager; + private GPUManager gpuManager; private EventLoopGroup backendGroup; private AtomicInteger port; + private AtomicInteger distributionPort; private AtomicInteger gpuCounter; - private static final Logger logger = LoggerFactory.getLogger(WorkLoadManager.class); - - public WorkLoadManager(ConfigManager configManager, EventLoopGroup backendGroup) { + public WorkLoadManager(ConfigManager configManager, GPUManager gpuManager, EventLoopGroup backendGroup) { this.configManager = configManager; + this.gpuManager = gpuManager; this.backendGroup = backendGroup; this.port = new AtomicInteger(configManager.getInitialWorkerPort()); + this.distributionPort = new AtomicInteger(configManager.getInitialDistributionPort()); this.gpuCounter = new AtomicInteger(0); + threadPool = Executors.newCachedThreadPool(); workers = new ConcurrentHashMap<>(); } @@ -85,6 +88,21 @@ public int getNumRunningWorkers(ModelVersionName modelVersionName) { return numWorking; } + public int getNumHealthyWorkers(ModelVersionName modelVersionName) { + int numHealthy = 0; + List threads = workers.getOrDefault(modelVersionName, null); + + if (threads != null) { + for (WorkerThread thread : threads) { + if (thread.isHealthy()) { + numHealthy += 1; + } + } + } + + return numHealthy; + } + /** * Checks if cpu_launcher is enabled and currentWorkers > 0 (i.e., not initializing workers). 
* Workers are restarted so that when dynamically scaling the number of workers, cores that were @@ -192,21 +210,41 @@ public CompletableFuture modelChanged( private void addThreads( List threads, Model model, int count, CompletableFuture future) { WorkerStateListener listener = new WorkerStateListener(future, count); - int maxGpu = configManager.getNumberOfGpu(); + int maxGpu = model.getNumCores(); for (int i = 0; i < count; ++i) { int gpuId = -1; if (maxGpu > 0) { - gpuId = gpuCounter.accumulateAndGet(maxGpu, (prev, maxGpuId) -> ++prev % maxGpuId); + if (model.isHasCfgDeviceIds() || model.getParallelLevel() > 1) { + gpuId = + model.getGpuCounter() + .getAndAccumulate( + maxGpu, + (prev, maxGpuId) -> + (prev + model.getParallelLevel()) % maxGpuId); + if (model.getParallelLevel() == 1) { + gpuId = model.getDeviceIds().get(gpuId); + } + } else { + gpuId = + gpuCounter.accumulateAndGet( + maxGpu, (prev, maxGpuId) -> ++prev % maxGpuId); + } } BatchAggregator aggregator = new BatchAggregator(model); + int currentPort = + model.getParallelLevel() > 1 + ? configManager.isDebug() + ? distributionPort.get() + : distributionPort.getAndAdd(model.getParallelLevel()) + : configManager.isDebug() ? port.get() : port.getAndIncrement(); WorkerThread thread = new WorkerThread( configManager, backendGroup, - configManager.isDebug() ? port.get() : port.getAndIncrement(), - gpuId, + currentPort, + gpuManager, model, aggregator, listener); diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerLifeCycle.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerLifeCycle.java index b4928a7143..158941150d 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerLifeCycle.java +++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerLifeCycle.java @@ -5,11 +5,18 @@ import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.Scanner; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.pytorch.serve.archive.model.ModelConfig; +import org.pytorch.serve.metrics.Dimension; import org.pytorch.serve.metrics.Metric; +import org.pytorch.serve.metrics.MetricCache; import org.pytorch.serve.util.ConfigManager; import org.pytorch.serve.util.Connector; import org.pytorch.serve.util.messages.EnvironmentUtils; @@ -30,7 +37,6 @@ public class WorkerLifeCycle { private Connector connector; private ReaderThread errReader; private ReaderThread outReader; - private String launcherArgs; private int numWorker; private int currNumRunningWorkers; @@ -45,10 +51,11 @@ public Process getProcess() { return process; } - public ArrayList launcherArgsToList() { + public ArrayList launcherArgsToList(String launcherArgs) { ArrayList arrlist = new ArrayList(); arrlist.add("-m"); - arrlist.add("intel_extension_for_pytorch.cpu.launch"); + arrlist.add("torch.backends.xeon.run_cpu"); + if (launcherArgs != null && launcherArgs.length() > 1) { String[] argarray = launcherArgs.split(" "); for (int i = 0; i < argarray.length; i++) { @@ -58,22 +65,25 @@ public ArrayList launcherArgsToList() { return arrlist; } - public boolean isLauncherAvailable() + public boolean isLauncherAvailable(String launcherArgs) throws WorkerInitializationException, InterruptedException { boolean launcherAvailable = false; - try { - ArrayList cmd = new ArrayList(); - 
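Device assignment in addThreads above now advances the per-model counter by parallelLevel, so each new worker claims a contiguous block of device slots; with parallelLevel == 1 this degenerates to the old round-robin. The arithmetic in isolation (illustrative wrapper, not patch code):

import java.util.concurrent.atomic.AtomicInteger;

public class StridedRoundRobin {
    private final AtomicInteger counter = new AtomicInteger(0);

    // returns the base slot for the next worker and advances by parallelLevel
    public int nextBaseSlot(int numVisibleGpus, int parallelLevel) {
        return counter.getAndAccumulate(
                numVisibleGpus, (prev, max) -> (prev + parallelLevel) % max);
    }
}

When the model pins explicit deviceIds and runs with parallelLevel == 1, the returned slot is additionally mapped through model.getDeviceIds(), as the diff above shows.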
cmd.add("python"); - ArrayList args = launcherArgsToList(); - cmd.addAll(args); - cmd.add("--no_python"); - // try launching dummy command to check launcher availability - String dummyCmd = "hostname"; - cmd.add(dummyCmd); - - String[] cmdList = new String[cmd.size()]; - cmdList = cmd.toArray(cmdList); + ArrayList cmd = new ArrayList(); + cmd.add("python"); + ArrayList args = launcherArgsToList(launcherArgs); + cmd.addAll(args); + cmd.add("--no_python"); + // try launching dummy command to check launcher availability + String dummyCmd = "hostname"; + cmd.add(dummyCmd); + + String[] cmdList = new String[cmd.size()]; + cmdList = cmd.toArray(cmdList); + + logger.debug("launcherAvailable cmdline: {}", cmd.toString()); + + try { Process processLauncher = Runtime.getRuntime().exec(cmdList); int ret = processLauncher.waitFor(); launcherAvailable = (ret == 0); @@ -83,7 +93,8 @@ public boolean isLauncherAvailable() return launcherAvailable; } - public void startWorker(int port) throws WorkerInitializationException, InterruptedException { + public void startWorker(int port, String deviceIds) + throws WorkerInitializationException, InterruptedException { File workingDir = new File(configManager.getModelServerHome()); File modelPath; setPort(port); @@ -93,28 +104,40 @@ public void startWorker(int port) throws WorkerInitializationException, Interrup throw new WorkerInitializationException("Failed get TS home directory", e); } - ArrayList argl = new ArrayList(); - argl.add(EnvironmentUtils.getPythonRunTime(model)); + ArrayList argl = new ArrayList<>(); + ArrayList envp = new ArrayList<>(); + envp.addAll( + Arrays.asList( + EnvironmentUtils.getEnvString( + workingDir.getAbsolutePath(), + modelPath.getAbsolutePath(), + model.getModelArchive().getManifest().getModel().getHandler()))); + + if (model.getParallelLevel() > 1) { + attachRunner(argl, envp, port, deviceIds); + } else if (model.getParallelLevel() == 1) { + argl.add(EnvironmentUtils.getPythonRunTime(model)); + } if (configManager.isCPULauncherEnabled()) { - launcherArgs = configManager.getCPULauncherArgs(); - boolean launcherAvailable = isLauncherAvailable(); + String launcherArgs = configManager.getCPULauncherArgs(); + boolean launcherAvailable = isLauncherAvailable(launcherArgs); if (launcherAvailable) { - ArrayList args = launcherArgsToList(); + ArrayList args = launcherArgsToList(launcherArgs); argl.addAll(args); // multi-worker core pinning if (this.numWorker > 1) { argl.add("--ninstances"); argl.add(String.valueOf(this.numWorker)); - argl.add("--instance_idx"); + argl.add("--rank"); // instance_idx is 0-indexed argl.add(String.valueOf(this.currNumRunningWorkers)); } } else { logger.warn( - "CPU launcher is enabled but launcher is not available. Proceeding without launcher."); + "torch.backends.xeon.run_cpu is not available. Proceeding without worker core pinning. 
For better performance, please make sure torch.backends.xeon.run_cpu is available."); } } @@ -127,20 +150,15 @@ public void startWorker(int port) throws WorkerInitializationException, Interrup argl.add("--metrics-config"); argl.add(configManager.getMetricsConfigPath()); - String[] envp = - EnvironmentUtils.getEnvString( - workingDir.getAbsolutePath(), - modelPath.getAbsolutePath(), - model.getModelArchive().getManifest().getModel().getHandler()); - try { - latch = new CountDownLatch(1); + latch = new CountDownLatch(model.getParallelLevel()); String[] args = argl.toArray(new String[argl.size()]); + String[] envs = envp.toArray(new String[envp.size()]); logger.debug("Worker cmdline: {}", argl.toString()); synchronized (this) { - process = Runtime.getRuntime().exec(args, envp, modelPath); + process = Runtime.getRuntime().exec(args, envs, modelPath); String threadName = "W-" + port + '-' + model.getModelVersionName().getVersionedModelName(); @@ -166,6 +184,39 @@ public void startWorker(int port) throws WorkerInitializationException, Interrup } } + private void attachRunner( + ArrayList argl, List envp, int port, String deviceIds) { + envp.add("LOGLEVEL=INFO"); + if (deviceIds != null) { + envp.add("CUDA_VISIBLE_DEVICES=" + deviceIds); + } + ModelConfig.TorchRun torchRun = model.getModelArchive().getModelConfig().getTorchRun(); + envp.add(String.format("OMP_NUM_THREADS=%d", torchRun.getOmpNumberThreads())); + argl.add("torchrun"); + argl.add("--nnodes"); + argl.add(String.valueOf(torchRun.getNnodes())); + argl.add("--nproc-per-node"); + argl.add(String.valueOf(torchRun.getNprocPerNode())); + argl.add("--log-dir"); + argl.add(ConfigManager.getInstance().getTorchRunLogDir()); + argl.add("--rdzv-backend"); + argl.add(torchRun.getRdzvBackend()); + if (torchRun.getRdzvEndpoint() != null) { + argl.add("--rdzv-endpoint"); + argl.add(torchRun.getRdzvEndpoint()); + } + argl.add("--rdzv-id"); + argl.add(String.format("%s_%d", model.getModelName(), port)); + if (torchRun.getMasterAddr() != null) { + argl.add("--master-addr"); + argl.add(torchRun.getMasterAddr()); + argl.add("--master-port"); + argl.add(String.valueOf(torchRun.getMasterPort())); + } + argl.add("--max-restarts"); + argl.add(String.valueOf(1)); + } + public synchronized void terminateIOStreams() { if (errReader != null) { logger.warn("terminateIOStreams() threadName={}", errReader.getName()); @@ -210,21 +261,26 @@ private synchronized void setPort(int port) { } private static final class ReaderThread extends Thread { - + private static final Pattern METRIC_PATTERN = + Pattern.compile("^(INFO > )?(\\[METRICS])(.*)"); + private static final Pattern WORKER_START_PATTERN = + Pattern.compile("^(INFO > )?(Torch worker started.)$"); + private static final Pattern WORKER_PID_PATTERN = + Pattern.compile("^(INFO > )?(\\[PID])(\\d+)$"); + private static final Logger loggerModelOutput = + LoggerFactory.getLogger(ConfigManager.MODEL_LOGGER); + private final MetricCache metricCache; private InputStream is; private boolean error; private WorkerLifeCycle lifeCycle; private AtomicBoolean isRunning = new AtomicBoolean(true); - private static final Logger loggerModelMetrics = - LoggerFactory.getLogger(ConfigManager.MODEL_METRICS_LOGGER); - private static final Logger loggerModelOutput = - LoggerFactory.getLogger(ConfigManager.MODEL_LOGGER); public ReaderThread(String name, InputStream is, boolean error, WorkerLifeCycle lifeCycle) { super(name + (error ? 
"-stderr" : "-stdout")); this.is = is; this.error = error; this.lifeCycle = lifeCycle; + this.metricCache = MetricCache.getInstance(); } public void terminate() { @@ -239,20 +295,50 @@ public void run() { if (result == null) { break; } - if (result.startsWith("[METRICS]")) { - Metric parsedMetric = Metric.parse(result.substring("[METRICS]".length())); + + Matcher matcher = METRIC_PATTERN.matcher(result); + if (matcher.matches()) { + logger.info("result={}, pattern={}", result, matcher.group(2)); + Metric parsedMetric = Metric.parse(matcher.group(3)); if (parsedMetric != null) { - loggerModelMetrics.info(parsedMetric.toString()); + if (this.metricCache.getMetricBackend(parsedMetric.getMetricName()) + != null) { + try { + List dimensionValues = new ArrayList(); + for (Dimension dimension : parsedMetric.getDimensions()) { + dimensionValues.add(dimension.getValue()); + } + // Hostname is added as a dimension by default to backend + // metrics + dimensionValues.add(parsedMetric.getHostName()); + this.metricCache + .getMetricBackend(parsedMetric.getMetricName()) + .addOrUpdate( + dimensionValues, + parsedMetric.getRequestId(), + Double.parseDouble(parsedMetric.getValue())); + } catch (Exception e) { + logger.error( + "Failed to update backend metric ", + parsedMetric.getMetricName(), + ": ", + e); + } + } } else { logger.error("Failed to parse metrics line: \"{}\".", result); } continue; } - if ("Torch worker started.".equals(result)) { + matcher = WORKER_START_PATTERN.matcher(result); + if (matcher.matches()) { lifeCycle.setSuccess(true); - } else if (result.startsWith("[PID]")) { - lifeCycle.setPid(Integer.parseInt(result.substring("[PID]".length()))); + } else { + matcher = WORKER_PID_PATTERN.matcher(result); + if (matcher.matches()) { + lifeCycle.setPid(Integer.parseInt(matcher.group(3))); + } } if (error) { loggerModelOutput.warn(result); diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerThread.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerThread.java index cb126452bc..830e2fa163 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerThread.java +++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerThread.java @@ -16,17 +16,23 @@ import java.net.HttpURLConnection; import java.net.SocketAddress; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.UUID; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.Collectors; +import org.pytorch.serve.archive.model.ModelConfig; import org.pytorch.serve.job.Job; import org.pytorch.serve.job.RestJob; -import org.pytorch.serve.metrics.Dimension; -import org.pytorch.serve.metrics.Metric; +import org.pytorch.serve.metrics.IMetric; +import org.pytorch.serve.metrics.MetricCache; import org.pytorch.serve.util.ConfigManager; +import org.pytorch.serve.util.GPUManager; import org.pytorch.serve.util.Connector; import org.pytorch.serve.util.codec.ModelRequestEncoder; import org.pytorch.serve.util.codec.ModelResponseDecoder; @@ -41,27 +47,25 @@ public class WorkerThread implements Runnable { private static final Logger logger = LoggerFactory.getLogger(WorkerThread.class); - private static final Logger loggerTsMetrics = - LoggerFactory.getLogger(ConfigManager.MODEL_SERVER_METRICS_LOGGER); private static final Logger 
loggerTelemetryMetrics = LoggerFactory.getLogger(ConfigManager.MODEL_SERVER_TELEMETRY_LOGGER); - - private Metric workerLoadTime; - private static final int[] BACK_OFF = { 0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597 }; - private static final long WORKER_TIMEOUT = 2L; private static final ModelRequestEncoder ENCODER = new ModelRequestEncoder(ConfigManager.getInstance().getPreferDirectBuffer()); - + private final IMetric workerThreadTimeMetric; + private final IMetric workerLoadTimeMetric; + private final List workerThreadTimeMetricDimensionValues; + private final List workerLoadTimeMetricDimensionValues; private ConfigManager configManager; + private GPUManager gpuManager; private EventLoopGroup backendEventGroup; private int port; private Model model; - private Channel backendChannel; + private ArrayList backendChannel = new ArrayList<>(); private AtomicBoolean running = new AtomicBoolean(true); private int backoffIdx; @@ -74,10 +78,39 @@ public class WorkerThread implements Runnable { private long startTime; private AtomicReference currentThread = new AtomicReference<>(); private String workerId; - private WorkerState state; - private WorkerLifeCycle lifeCycle; + private int responseTimeout; + private long recoveryStartTS; // 0: default value. no recovery needed, in healthy mode + + public WorkerThread( + ConfigManager configManager, + EventLoopGroup backendEventGroup, + int port, + GPUManager gpuManager, + Model model, + BatchAggregator aggregator, + WorkerStateListener listener) { + this.workerId = String.valueOf(port); // Unique across all workers. + this.configManager = configManager; + this.backendEventGroup = backendEventGroup; + this.port = port; + this.model = model; + this.aggregator = aggregator; + this.gpuManager = gpuManager; + this.gpuId = gpuManager.getGPU(this.workerId); + this.listener = listener; + startTime = System.currentTimeMillis(); + lifeCycle = new WorkerLifeCycle(configManager, model); + replies = new ArrayBlockingQueue<>(model.getParallelLevel()); + this.workerThreadTimeMetric = + MetricCache.getInstance().getMetricFrontend("WorkerThreadTime"); + this.workerLoadTimeMetric = MetricCache.getInstance().getMetricFrontend("WorkerLoadTime"); + this.workerThreadTimeMetricDimensionValues = + Arrays.asList("Host", ConfigManager.getInstance().getHostName()); + this.workerLoadTimeMetricDimensionValues = + Arrays.asList(getWorkerName(), "Host", ConfigManager.getInstance().getHostName()); + } public WorkerState getState() { return state; @@ -140,37 +173,9 @@ public WorkerLifeCycle getLifeCycle() { return lifeCycle; } - public WorkerThread( - ConfigManager configManager, - EventLoopGroup backendEventGroup, - int port, - int gpuId, - Model model, - BatchAggregator aggregator, - WorkerStateListener listener) { - this.workerId = String.valueOf(port); // Unique across all workers. 
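The ReaderThread rewrite further above swaps the old startsWith prefix checks for regular expressions, so that worker log lines may carry an optional "INFO > " prefix as emitted when workers run under torchrun. A quick standalone check of that matching (the metric payload shown is illustrative):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class WorkerLogPatterns {
    private static final Pattern METRIC_PATTERN =
            Pattern.compile("^(INFO > )?(\\[METRICS])(.*)");
    private static final Pattern WORKER_PID_PATTERN =
            Pattern.compile("^(INFO > )?(\\[PID])(\\d+)$");

    public static void main(String[] args) {
        Matcher m = METRIC_PATTERN.matcher("INFO > [METRICS]HandlerTime.ms:12.3|#Level:Model");
        // group(3) is the payload handed to Metric.parse
        System.out.println(m.matches() ? m.group(3) : "no match");
        Matcher p = WORKER_PID_PATTERN.matcher("[PID]4242");
        System.out.println(p.matches() ? Integer.parseInt(p.group(3)) : -1); // prints 4242
    }
}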
- this.configManager = configManager; - this.backendEventGroup = backendEventGroup; - this.port = port; - this.model = model; - this.aggregator = aggregator; - this.gpuId = gpuId; - this.listener = listener; - startTime = System.currentTimeMillis(); - lifeCycle = new WorkerLifeCycle(configManager, model); - replies = new ArrayBlockingQueue<>(1); - workerLoadTime = - new Metric( - getWorkerName(), - String.valueOf(System.currentTimeMillis()), - "ms", - ConfigManager.getInstance().getHostName(), - new Dimension("Level", "Host")); - } - @Override public void run() { - int responseTimeout = model.getResponseTimeout(); + responseTimeout = model.getResponseTimeout(); Thread thread = Thread.currentThread(); thread.setName(getWorkerName()); currentThread.set(thread); @@ -184,28 +189,55 @@ public void run() { req = aggregator.getRequest(workerId, state); long wtStartTime = System.currentTimeMillis(); - logger.info("Flushing req. to backend at: " + wtStartTime); - backendChannel.writeAndFlush(req).sync(); - - long begin = System.currentTimeMillis(); - ModelWorkerResponse reply = replies.poll(responseTimeout, TimeUnit.SECONDS); - - long duration = System.currentTimeMillis() - begin; - logger.info("Backend response time: {}", duration); - - if (reply != null) { - aggregator.sendResponse(reply); - } else if (req.getCommand() != WorkerCommands.DESCRIBE) { - int val = model.incrFailedInfReqs(); - logger.error("Number or consecutive unsuccessful inference {}", val); - throw new WorkerInitializationException( - "Backend worker did not respond in given time"); + logger.info("Flushing req.cmd {} to backend at: {}", req.getCommand(), wtStartTime); + int repeats = + (req.getCommand() == WorkerCommands.LOAD) + || ((req.getCommand() == WorkerCommands.PREDICT + || req.getCommand() + == WorkerCommands.STREAMPREDICT) + && model.getParallelLevel() > 1 + && model.getParallelType() + != ModelConfig.ParallelType.PP) + ? model.getParallelLevel() + : 1; + for (int i = 0; backendChannel.size() > 0 && i < repeats; i++) { + backendChannel.get(i).writeAndFlush(req).sync(); + } + boolean isStreaming = + req.getCommand() == WorkerCommands.STREAMPREDICT; + ModelWorkerResponse reply = null; + + boolean jobDone = false; + long totalDuration = 0; + do { + long begin = System.currentTimeMillis(); + for (int i = 0; i < repeats; i++) { + reply = replies.poll(responseTimeout, TimeUnit.SECONDS); + } + + long duration = System.currentTimeMillis() - begin; + + if (reply != null) { + jobDone = aggregator.sendResponse(reply); + logger.debug("sent a reply, jobDone: {}", jobDone); + } else if (req.getCommand() != WorkerCommands.DESCRIBE) { + int val = model.incrFailedInfReqs(); + logger.error("Number of consecutive unsuccessful inferences {}", val); + throw new WorkerInitializationException( + "Backend worker did not respond in given time"); + } + totalDuration += duration; + } while (!jobDone); + logger.info("Backend response time: {}", totalDuration); + switch (req.getCommand()) { case PREDICT: model.resetFailedInfReqs(); break; + case STREAMPREDICT: + model.resetFailedInfReqs(); + break; case LOAD: if (reply.getCode() == 200) { setState(WorkerState.WORKER_MODEL_LOADED, HttpURLConnection.HTTP_OK); @@ -229,16 +261,16 @@ public void run() { break; } req = null; - String workerThreadTime = - String.valueOf(((System.currentTimeMillis() - wtStartTime) - duration)); - loggerTsMetrics.info( - "{}", - new Metric( - "WorkerThreadTime", - workerThreadTime, - "ms", - ConfigManager.getInstance().getHostName(), - new Dimension("Level", "Host"))); + double workerThreadTime = + (System.currentTimeMillis() - wtStartTime) - totalDuration; + if (this.workerThreadTimeMetric != null) { + try { + this.workerThreadTimeMetric.addOrUpdate( + this.workerThreadTimeMetricDimensionValues, workerThreadTime); + } catch (Exception e) { + logger.error("Failed to update frontend metric WorkerThreadTime: ", e); + } + } } } catch (InterruptedException e) { logger.debug("System state is : " + state); @@ -272,7 +304,9 @@ public void run() { // WorkerThread is running in thread pool, the thread will be assigned to next // Runnable once this worker is finished. If currentThread keep holding the reference // of the thread, currentThread.interrupt() might kill next worker. 
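The repeats expression rewritten above encodes the fan-out rule for multi-process workers: LOAD always goes to every parallel channel, and PREDICT/STREAMPREDICT fan out as well unless the model is pipeline-parallel, in which case only the first channel receives the request. Restated as a helper of our own (assuming the repo's WorkerCommands and ModelConfig types):

import org.pytorch.serve.archive.model.ModelConfig;
import org.pytorch.serve.util.messages.WorkerCommands;

public final class FanOutRule {
    private FanOutRule() {}

    public static int repeats(
            WorkerCommands cmd, int parallelLevel, ModelConfig.ParallelType type) {
        boolean inference =
                cmd == WorkerCommands.PREDICT || cmd == WorkerCommands.STREAMPREDICT;
        // pipeline-parallel (PP) models are fed through rank 0 only
        if (cmd == WorkerCommands.LOAD
                || (inference && parallelLevel > 1 && type != ModelConfig.ParallelType.PP)) {
            return parallelLevel;
        }
        return 1;
    }
}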
- backendChannel.disconnect(); + for (int i = 0; backendChannel.size() > 0 && i < model.getParallelLevel(); i++) { + backendChannel.get(i).disconnect(); + } currentThread.set(null); Integer exitValue = lifeCycle.getExitValue(); @@ -285,7 +319,9 @@ public void run() { } setState(WorkerState.WORKER_STOPPED, status); lifeCycle.exit(); - retry(); + if (isHealthy()) { // still within maxRetryTimeoutInMill window + retry(); + } } } @@ -303,73 +339,81 @@ public void setMemory(long memory) { private void connect() throws WorkerInitializationException, InterruptedException { if (!configManager.isDebug()) { - lifeCycle.startWorker(port); + lifeCycle.startWorker(port, getDeviceIds()); } String modelName = model.getModelName(); String modelVersion = model.getVersion(); setState(WorkerState.WORKER_STARTED, HttpURLConnection.HTTP_OK); - final CountDownLatch latch = new CountDownLatch(1); - + final int parallelLevel = model.getParallelLevel(); + final CountDownLatch latch = new CountDownLatch(parallelLevel); final int responseBufferSize = configManager.getMaxResponseSize(); try { - Connector connector = new Connector(port); - Bootstrap b = new Bootstrap(); - b.group(backendEventGroup) - .channel(connector.getClientChannel()) - .handler( - new ChannelInitializer() { - @Override - public void initChannel(Channel ch) { - ChannelPipeline p = ch.pipeline(); - p.addLast(ENCODER); - p.addLast(new ModelResponseDecoder(responseBufferSize)); - p.addLast(new WorkerHandler()); - } - }); - - SocketAddress address = connector.getSocketAddress(); - logger.info("Connecting to: {}", address); - backendChannel = b.connect(address).sync().channel(); - backendChannel - .closeFuture() - .addListener( - (ChannelFutureListener) - future -> { - latch.countDown(); - logger.info( - "{} Worker disconnected. {}", getWorkerId(), state); - Thread thread = currentThread.getAndSet(null); - if (thread != null) { - thread.interrupt(); - } - }); - - backendChannel - .newSucceededFuture() - .addListener( - (ChannelFutureListener) - future -> { - // TODO: - // use gpu, batch size in load model command - RequestInput input = - new RequestInput(UUID.randomUUID().toString()); - if (gpuId >= 0) { - input.addParameter( - new InputParameter( - "gpu", String.valueOf(gpuId))); - } - - Job job = - new RestJob( - null, - modelName, - modelVersion, - WorkerCommands.LOAD, - input); - model.addJob(workerId, job); - latch.countDown(); - }); + for (int i = 0; i < parallelLevel; i++) { + Connector connector = new Connector(port + i); + Bootstrap b = new Bootstrap(); + b.group(backendEventGroup) + .channel(connector.getClientChannel()) + .handler( + new ChannelInitializer() { + @Override + public void initChannel(Channel ch) { + ChannelPipeline p = ch.pipeline(); + p.addLast(ENCODER); + p.addLast(new ModelResponseDecoder(responseBufferSize)); + p.addLast(new WorkerHandler()); + } + }); + + SocketAddress address = connector.getSocketAddress(); + logger.info("Connecting to: {}", address); + backendChannel.add(b.connect(address).sync().channel()); + backendChannel + .get(i) + .closeFuture() + .addListener( + (ChannelFutureListener) + future -> { + latch.countDown(); + logger.info( + "{} Worker disconnected. 
{}", + getWorkerId(), + state); + Thread thread = currentThread.getAndSet(null); + if (thread != null) { + thread.interrupt(); + } + }); + backendChannel + .get(i) + .newSucceededFuture() + .addListener( + (ChannelFutureListener) + future -> { + // TODO: + // use gpu, batch size in load model command + if (latch.getCount() == 1) { + RequestInput input = + new RequestInput( + UUID.randomUUID().toString()); + if (gpuId >= 0) { + input.addParameter( + new InputParameter( + "gpu", String.valueOf(gpuId))); + } + + Job job = + new RestJob( + null, + modelName, + modelVersion, + WorkerCommands.LOAD, + input); + model.addJob(workerId, job); + } + latch.countDown(); + }); + } if (!latch.await(WORKER_TIMEOUT, TimeUnit.MINUTES)) { throw new WorkerInitializationException( @@ -377,7 +421,7 @@ public void initChannel(Channel ch) { } running.set(true); } catch (Throwable t) { - // https://github.com/netty/netty/issues/2597 + /* https://github.com/netty/netty/issues/2597 */ if (t instanceof IOException) { throw new WorkerInitializationException("Failed to connect to worker.", t); } @@ -404,8 +448,10 @@ public int getPid() { public void shutdown() { running.set(false); setState(WorkerState.WORKER_SCALED_DOWN, HttpURLConnection.HTTP_OK); - if (backendChannel != null) { - backendChannel.close(); + for (int i = 0; backendChannel.size() > 0 && i < model.getParallelLevel(); i++) { + if (backendChannel.get(i) != null) { + backendChannel.get(i).close(); + } } lifeCycle.terminateIOStreams(); Thread thread = currentThread.getAndSet(null); @@ -427,16 +473,33 @@ public void setState(WorkerState newState, int status) { listener.notifyChangeState( model.getModelVersionName().getVersionedModelName(), newState, status); logger.debug("{} State change {} -> {}", getWorkerName(), state, newState); - long timeTaken = System.currentTimeMillis() - startTime; + long currentTS = System.currentTimeMillis(); + long timeTaken = currentTS - startTime; if (state != WorkerState.WORKER_SCALED_DOWN) { // Don't update the state if it was terminated on purpose.. Scaling in.. 
this.state = newState; } + if (state == WorkerState.WORKER_MODEL_LOADED) { - workerLoadTime.setValue(String.valueOf(timeTaken)); - workerLoadTime.setTimestamp( - String.valueOf(TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()))); - loggerTsMetrics.info("{}", workerLoadTime); + if (this.workerLoadTimeMetric != null) { + try { + this.workerLoadTimeMetric.addOrUpdate( + this.workerLoadTimeMetricDimensionValues, timeTaken); + } catch (Exception e) { + logger.error("Failed to update frontend metric WorkerLoadTime: ", e); + } + } + if (recoveryStartTS > 0) { + logger.info("Auto recovery succeeded, reset recoveryStartTS"); + recoveryStartTS = 0; + } + } else if (state == WorkerState.WORKER_STOPPED) { + if (recoveryStartTS == 0) { + recoveryStartTS = currentTS; + logger.info("Auto recovery start timestamp: {}", recoveryStartTS); + } else { + logger.warn("Auto recovery failed again"); + } } } @@ -451,18 +514,48 @@ public void retry() { if (backoffIdx < BACK_OFF.length - 1) { ++backoffIdx; } + this.gpuId = gpuManager.getGPU(this.workerId); manager.getScheduler() .schedule(() -> manager.submitTask(this), BACK_OFF[backoffIdx], TimeUnit.SECONDS); logger.info("Retry worker: {} in {} seconds.", workerId, BACK_OFF[backoffIdx]); } + private String getDeviceIds() { + List<Integer> deviceIds; + if (gpuId == -1 || model.getParallelLevel() == 1) { + return null; + } else if (model.isHasCfgDeviceIds()) { + return model.getDeviceIds().subList(gpuId, gpuId + model.getParallelLevel()).stream() + .map(String::valueOf) + .collect(Collectors.joining(",")); + } else { + deviceIds = new ArrayList<>(model.getParallelLevel()); + for (int i = gpuId; i < gpuId + model.getParallelLevel(); i++) { + deviceIds.add(i); + } + return deviceIds.stream().map(String::valueOf).collect(Collectors.joining(",")); + } + } + + public boolean isHealthy() { + if (recoveryStartTS == 0 + || (System.currentTimeMillis() - recoveryStartTS) + < model.getMaxRetryTimeoutInMill()) { + return true; + } + return false; + } + @ChannelHandler.Sharable private class WorkerHandler extends SimpleChannelInboundHandler<ModelWorkerResponse> { @Override public void channelRead0(ChannelHandlerContext ctx, ModelWorkerResponse msg) { - if (!replies.offer(msg)) { + try { + replies.offer(msg, responseTimeout, TimeUnit.SECONDS); + } catch (InterruptedException | NullPointerException e) { + logger.error("Failed to offer reply", e); throw new IllegalStateException("Reply queue is full."); } } diff --git a/frontend/server/src/main/java/org/pytorch/serve/workflow/api/http/WorkflowInferenceRequestHandler.java b/frontend/server/src/main/java/org/pytorch/serve/workflow/api/http/WorkflowInferenceRequestHandler.java index 1c67f4d2b8..042bef68e1 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/workflow/api/http/WorkflowInferenceRequestHandler.java +++ b/frontend/server/src/main/java/org/pytorch/serve/workflow/api/http/WorkflowInferenceRequestHandler.java @@ -22,6 +22,7 @@ import org.pytorch.serve.util.NettyUtils; import org.pytorch.serve.util.messages.InputParameter; import org.pytorch.serve.util.messages.RequestInput; +import org.pytorch.serve.wlm.WorkerInitializationException; import org.pytorch.serve.workflow.WorkflowManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,13 +40,46 @@ public class WorkflowInferenceRequestHandler extends HttpRequestHandlerChain { /** Creates a new {@code WorkflowInferenceRequestHandler} instance.
*/ public WorkflowInferenceRequestHandler() {} + private static RequestInput parseRequest(ChannelHandlerContext ctx, FullHttpRequest req) { + String requestId = NettyUtils.getRequestId(ctx.channel()); + RequestInput inputData = new RequestInput(requestId); + + CharSequence contentType = HttpUtil.getMimeType(req); + for (Map.Entry entry : req.headers().entries()) { + inputData.updateHeaders(entry.getKey(), entry.getValue()); + } + + if (HttpPostRequestDecoder.isMultipart(req) + || HttpHeaderValues.APPLICATION_X_WWW_FORM_URLENCODED.contentEqualsIgnoreCase( + contentType)) { + HttpDataFactory factory = + new DefaultHttpDataFactory(ConfigManager.getInstance().getMaxRequestSize()); + HttpPostRequestDecoder form = new HttpPostRequestDecoder(factory, req); + try { + while (form.hasNext()) { + inputData.addParameter(NettyUtils.getFormData(form.next())); + } + } catch (HttpPostRequestDecoder.EndOfDataDecoderException ignore) { + logger.trace("End of multipart items."); + } finally { + form.cleanFiles(); + form.destroy(); + } + } else { + byte[] content = NettyUtils.getBytes(req.content()); + inputData.addParameter(new InputParameter("body", content, contentType)); + } + return inputData; + } + @Override public void handleRequest( ChannelHandlerContext ctx, FullHttpRequest req, QueryStringDecoder decoder, String[] segments) - throws ModelException, DownloadArchiveException, WorkflowException { + throws ModelException, DownloadArchiveException, WorkflowException, + WorkerInitializationException { if ("wfpredict".equalsIgnoreCase(segments[1])) { if (segments.length < 3) { throw new ResourceNotFoundException(); @@ -84,36 +118,4 @@ private void sendResponse(ChannelHandlerContext ctx, StatusResponse statusRespon } } } - - private static RequestInput parseRequest(ChannelHandlerContext ctx, FullHttpRequest req) { - String requestId = NettyUtils.getRequestId(ctx.channel()); - RequestInput inputData = new RequestInput(requestId); - - CharSequence contentType = HttpUtil.getMimeType(req); - for (Map.Entry entry : req.headers().entries()) { - inputData.updateHeaders(entry.getKey(), entry.getValue()); - } - - if (HttpPostRequestDecoder.isMultipart(req) - || HttpHeaderValues.APPLICATION_X_WWW_FORM_URLENCODED.contentEqualsIgnoreCase( - contentType)) { - HttpDataFactory factory = - new DefaultHttpDataFactory(ConfigManager.getInstance().getMaxRequestSize()); - HttpPostRequestDecoder form = new HttpPostRequestDecoder(factory, req); - try { - while (form.hasNext()) { - inputData.addParameter(NettyUtils.getFormData(form.next())); - } - } catch (HttpPostRequestDecoder.EndOfDataDecoderException ignore) { - logger.trace("End of multipart items."); - } finally { - form.cleanFiles(); - form.destroy(); - } - } else { - byte[] content = NettyUtils.getBytes(req.content()); - inputData.addParameter(new InputParameter("body", content, contentType)); - } - return inputData; - } } diff --git a/frontend/server/src/main/java/org/pytorch/serve/workflow/api/http/WorkflowMgmtRequestHandler.java b/frontend/server/src/main/java/org/pytorch/serve/workflow/api/http/WorkflowMgmtRequestHandler.java index 3f3e599739..b50f5891b7 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/workflow/api/http/WorkflowMgmtRequestHandler.java +++ b/frontend/server/src/main/java/org/pytorch/serve/workflow/api/http/WorkflowMgmtRequestHandler.java @@ -26,6 +26,7 @@ import org.pytorch.serve.http.StatusResponse; import org.pytorch.serve.util.JsonUtils; import org.pytorch.serve.util.NettyUtils; +import org.pytorch.serve.wlm.WorkerInitializationException; 
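The `parseRequest` helper hoisted to the top of `WorkflowInferenceRequestHandler` above follows a single dispatch rule: multipart and `application/x-www-form-urlencoded` bodies are decoded into one parameter per form field, and any other payload is wrapped whole as a single `body` parameter. The sketch below restates that rule with plain-Java stand-ins; the `Parameter` record and the pre-decoded `formFields` map are hypothetical simplifications of Netty's `HttpPostRequestDecoder` machinery.

```java
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

// Simplified model of WorkflowInferenceRequestHandler.parseRequest:
// only the content-type dispatch is shown, not Netty's actual decoding.
public class RequestParsingSketch {
    record Parameter(String name, byte[] value) {}

    static List<Parameter> parse(String contentType, boolean isMultipart,
                                 Map<String, byte[]> formFields, byte[] rawBody) {
        List<Parameter> params = new ArrayList<>();
        boolean formEncoded = "application/x-www-form-urlencoded".equalsIgnoreCase(contentType);
        if (isMultipart || formEncoded) {
            // Each decoded form field becomes its own named parameter.
            formFields.forEach((name, value) -> params.add(new Parameter(name, value)));
        } else {
            // JSON, octet-stream, etc. pass through as one "body" parameter.
            params.add(new Parameter("body", rawBody));
        }
        return params;
    }

    public static void main(String[] args) {
        byte[] json = "{\"x\": 1}".getBytes();
        System.out.println(parse("application/json", false, Map.of(), json).get(0).name()); // body
    }
}
```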
import org.pytorch.serve.workflow.WorkflowManager; import org.pytorch.serve.workflow.messages.DescribeWorkflowResponse; import org.pytorch.serve.workflow.messages.ListWorkflowResponse; @@ -41,13 +42,27 @@ public class WorkflowMgmtRequestHandler extends HttpRequestHandlerChain { /** Creates a new {@code WorkflowMgmtRequestHandler} instance. */ public WorkflowMgmtRequestHandler() {} + private static DescribeWorkflowResponse createWorkflowResponse( + String workflowName, WorkFlow workflow) { + DescribeWorkflowResponse response = new DescribeWorkflowResponse(); + response.setWorkflowName(workflowName); + response.setWorkflowUrl(workflow.getWorkflowArchive().getUrl()); + response.setBatchSize(workflow.getBatchSize()); + response.setMaxBatchDelay(workflow.getMaxBatchDelay()); + response.setMaxWorkers(workflow.getMaxWorkers()); + response.setMinWorkers(workflow.getMinWorkers()); + response.setWorkflowDag(workflow.getWorkflowDag()); + return response; + } + @Override public void handleRequest( ChannelHandlerContext ctx, FullHttpRequest req, QueryStringDecoder decoder, String[] segments) - throws ModelException, DownloadArchiveException, WorkflowException { + throws ModelException, DownloadArchiveException, WorkflowException, + WorkerInitializationException { if (isManagementReq(segments)) { if (!"workflows".equals(segments[1])) { throw new ResourceNotFoundException(); @@ -194,17 +209,4 @@ private void sendResponse(ChannelHandlerContext ctx, StatusResponse statusRespon } } } - - private static DescribeWorkflowResponse createWorkflowResponse( - String workflowName, WorkFlow workflow) { - DescribeWorkflowResponse response = new DescribeWorkflowResponse(); - response.setWorkflowName(workflowName); - response.setWorkflowUrl(workflow.getWorkflowArchive().getUrl()); - response.setBatchSize(workflow.getBatchSize()); - response.setMaxBatchDelay(workflow.getMaxBatchDelay()); - response.setMaxWorkers(workflow.getMaxWorkers()); - response.setMinWorkers(workflow.getMinWorkers()); - response.setWorkflowDag(workflow.getWorkflowDag()); - return response; - } } diff --git a/frontend/server/src/main/resources/proto/inference.proto b/frontend/server/src/main/resources/proto/inference.proto index cd9aa89d21..338e36ff21 100644 --- a/frontend/server/src/main/resources/proto/inference.proto +++ b/frontend/server/src/main/resources/proto/inference.proto @@ -33,4 +33,7 @@ service InferenceAPIsService { // Predictions entry point to get inference using default model version. rpc Predictions(PredictionsRequest) returns (PredictionResponse) {} + + // Streaming response for an inference request. 
+ rpc StreamPredictions(PredictionsRequest) returns (stream PredictionResponse) {} } diff --git a/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java b/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java index f60dc72cbf..d19b450d99 100644 --- a/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java +++ b/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java @@ -28,6 +28,7 @@ import java.security.GeneralSecurityException; import java.util.List; import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; import java.util.regex.Pattern; import java.util.stream.IntStream; import org.apache.commons.io.FileUtils; @@ -38,6 +39,7 @@ import org.pytorch.serve.http.messages.ListModelsResponse; import org.pytorch.serve.metrics.Dimension; import org.pytorch.serve.metrics.Metric; +import org.pytorch.serve.metrics.MetricCache; import org.pytorch.serve.metrics.MetricManager; import org.pytorch.serve.servingsdk.impl.PluginsManager; import org.pytorch.serve.snapshot.InvalidSnapshotException; @@ -75,7 +77,9 @@ public void beforeSuite() InvalidSnapshotException { ConfigManager.init(new ConfigManager.Arguments()); configManager = ConfigManager.getInstance(); + configManager.setProperty("metrics_mode", "prometheus"); PluginsManager.getInstance().initialize(); + MetricCache.init(); InternalLoggerFactory.setDefaultFactory(Slf4JLoggerFactory.INSTANCE); @@ -1873,6 +1877,14 @@ public void testScaleModelFailure() throws InterruptedException, SkipException { Assert.assertEquals(resp.getCode(), HttpResponseStatus.INTERNAL_SERVER_ERROR.code()); Assert.assertEquals( resp.getMessage(), "Failed to start workers for model init-error version: null"); + + TestUtils.ping(configManager); + TestUtils.getLatch().await(); + // There is a retry time window. To reduce CI latency, + // it is fine for ping to return either 200 or 500.
+ Assert.assertTrue( + TestUtils.getHttpStatus().equals(HttpResponseStatus.INTERNAL_SERVER_ERROR) + || TestUtils.getHttpStatus().equals(HttpResponseStatus.OK)); } @Test( @@ -2080,8 +2092,38 @@ public void testUnregisterModelFailure() throws InterruptedException, SkipExcept channel = TestUtils.connect(ConnectorType.MANAGEMENT_CONNECTOR, configManager); Assert.assertNotNull(channel); + TestUtils.setResult(null); + TestUtils.setLatch(new CountDownLatch(1)); TestUtils.unregisterModel(channel, "noopversioned", "1.11", false); + TestUtils.getLatch().await(); + + TestUtils.setResult(null); + TestUtils.setLatch(new CountDownLatch(1)); TestUtils.unregisterModel(channel, "noopversioned", "1.2.1", false); + TestUtils.getLatch().await(); + } + + @Test( + alwaysRun = true, + dependsOnMethods = {"testUnregisterModelFailure"}) + public void testClientTimeout() throws InterruptedException { + Channel mgmtChannel = TestUtils.connect(ConnectorType.MANAGEMENT_CONNECTOR, configManager); + loadTests(mgmtChannel, "echo-client-timeout.mar", "echo-client-timeout"); + + Channel inferChannel = TestUtils.connect(ConnectorType.INFERENCE_CONNECTOR, configManager); + TestUtils.setResult(null); + TestUtils.setLatch(new CountDownLatch(1)); + DefaultFullHttpRequest req = + new DefaultFullHttpRequest( + HttpVersion.HTTP_1_1, HttpMethod.POST, "/predictions/echo-client-timeout"); + req.content().writeZero(10385760); + HttpUtil.setContentLength(req, req.content().readableBytes()); + req.headers().set(HttpHeaderNames.CONTENT_TYPE, HttpHeaderValues.APPLICATION_OCTET_STREAM); + inferChannel.writeAndFlush(req); + TestUtils.getLatch().await(1, TimeUnit.SECONDS); + Assert.assertNull(TestUtils.result); + + unloadTests(mgmtChannel, "echo-client-timeout"); } private void testLoadModel(String url, String modelName, String version) diff --git a/frontend/server/src/test/java/org/pytorch/serve/SnapshotTest.java b/frontend/server/src/test/java/org/pytorch/serve/SnapshotTest.java index 1b28e26909..1f7f678d99 100644 --- a/frontend/server/src/test/java/org/pytorch/serve/SnapshotTest.java +++ b/frontend/server/src/test/java/org/pytorch/serve/SnapshotTest.java @@ -27,6 +27,7 @@ import java.util.Properties; import java.util.concurrent.CountDownLatch; import org.apache.commons.io.FileUtils; +import org.pytorch.serve.metrics.MetricCache; import org.pytorch.serve.servingsdk.impl.PluginsManager; import org.pytorch.serve.servingsdk.snapshot.Snapshot; import org.pytorch.serve.snapshot.InvalidSnapshotException; @@ -59,6 +60,7 @@ public void beforeSuite() ConfigManager.init(new ConfigManager.Arguments()); configManager = ConfigManager.getInstance(); PluginsManager.getInstance().initialize(); + MetricCache.init(); InternalLoggerFactory.setDefaultFactory(Slf4JLoggerFactory.INSTANCE); configManager.setInitialWorkerPort(9500); @@ -267,6 +269,7 @@ public void testStartTorchServeWithLastSnapshot() System.setProperty("tsConfigFile", ""); ConfigManager.init(new ConfigManager.Arguments()); configManager = ConfigManager.getInstance(); + MetricCache.init(); server = new ModelServer(configManager); server.startRESTserver(); Channel channel = null; @@ -292,6 +295,7 @@ public void testRestartTorchServeWithSnapshotAsConfig() System.setProperty("tsConfigFile", getLastSnapshot()); ConfigManager.init(new ConfigManager.Arguments()); configManager = ConfigManager.getInstance(); + MetricCache.init(); server = new ModelServer(configManager); server.startRESTserver(); Channel channel = null; diff --git a/frontend/server/src/test/java/org/pytorch/serve/TestUtils.java 
b/frontend/server/src/test/java/org/pytorch/serve/TestUtils.java index 90a70ecb65..e4be893019 100644 --- a/frontend/server/src/test/java/org/pytorch/serve/TestUtils.java +++ b/frontend/server/src/test/java/org/pytorch/serve/TestUtils.java @@ -50,9 +50,9 @@ public final class TestUtils { private static Channel metricsChannel; private static String tsInferLatencyPattern = "ts_inference_latency_microseconds\\{" - + "uuid=\"[\\w]{8}(-[\\w]{4}){3}-[\\w]{12}\"," + "model_name=\"%s\"," - + "model_version=\"%s\",\\}\\s\\d+(\\.\\d+)"; + + "model_version=\"%s\"," + + "hostname=\".+\",\\}\\s\\d+(\\.\\d+)"; private TestUtils() {} diff --git a/frontend/server/src/test/java/org/pytorch/serve/WorkflowTest.java b/frontend/server/src/test/java/org/pytorch/serve/WorkflowTest.java index 0ac43eb65d..68a025065e 100644 --- a/frontend/server/src/test/java/org/pytorch/serve/WorkflowTest.java +++ b/frontend/server/src/test/java/org/pytorch/serve/WorkflowTest.java @@ -19,6 +19,7 @@ import org.apache.commons.io.FileUtils; import org.pytorch.serve.http.ErrorResponse; import org.pytorch.serve.http.StatusResponse; +import org.pytorch.serve.metrics.MetricCache; import org.pytorch.serve.servingsdk.impl.PluginsManager; import org.pytorch.serve.snapshot.InvalidSnapshotException; import org.pytorch.serve.util.ConfigManager; @@ -51,6 +52,7 @@ public void beforeClass() ConfigManager.init(new ConfigManager.Arguments()); configManager = ConfigManager.getInstance(); PluginsManager.getInstance().initialize(); + MetricCache.init(); InternalLoggerFactory.setDefaultFactory(Slf4JLoggerFactory.INSTANCE); configManager.setInitialWorkerPort(10000); diff --git a/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricCacheTest.java b/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricCacheTest.java new file mode 100644 index 0000000000..7170757287 --- /dev/null +++ b/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricCacheTest.java @@ -0,0 +1,63 @@ +package org.pytorch.serve.metrics; + +import org.pytorch.serve.metrics.format.prometheous.PrometheusCounter; +import org.pytorch.serve.metrics.format.prometheous.PrometheusGauge; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class MetricCacheTest { + @Test + public void testMetricCacheLoadValidConfiguration() { + MetricCache metricCache = MetricCache.getInstance(); + Assert.assertEquals( + metricCache.getMetricFrontend("Requests2XX").getClass(), PrometheusCounter.class); + Assert.assertEquals( + metricCache.getMetricFrontend("Requests4XX").getClass(), PrometheusCounter.class); + Assert.assertEquals( + metricCache.getMetricFrontend("Requests5XX").getClass(), PrometheusCounter.class); + Assert.assertEquals( + metricCache.getMetricFrontend("ts_inference_requests_total").getClass(), + PrometheusCounter.class); + Assert.assertEquals( + metricCache.getMetricFrontend("ts_inference_latency_microseconds").getClass(), + PrometheusCounter.class); + Assert.assertEquals( + metricCache.getMetricFrontend("ts_queue_latency_microseconds").getClass(), + PrometheusCounter.class); + Assert.assertEquals( + metricCache.getMetricFrontend("QueueTime").getClass(), PrometheusGauge.class); + Assert.assertEquals( + metricCache.getMetricFrontend("WorkerThreadTime").getClass(), + PrometheusGauge.class); + Assert.assertEquals( + metricCache.getMetricFrontend("WorkerLoadTime").getClass(), PrometheusGauge.class); + Assert.assertEquals( + metricCache.getMetricFrontend("CPUUtilization").getClass(), PrometheusGauge.class); + Assert.assertEquals( + 
metricCache.getMetricFrontend("MemoryUsed").getClass(), PrometheusGauge.class); + Assert.assertEquals( + metricCache.getMetricFrontend("MemoryAvailable").getClass(), PrometheusGauge.class); + Assert.assertEquals( + metricCache.getMetricFrontend("MemoryUtilization").getClass(), + PrometheusGauge.class); + Assert.assertEquals( + metricCache.getMetricFrontend("DiskUsage").getClass(), PrometheusGauge.class); + Assert.assertEquals( + metricCache.getMetricFrontend("DiskUtilization").getClass(), PrometheusGauge.class); + Assert.assertEquals( + metricCache.getMetricFrontend("DiskAvailable").getClass(), PrometheusGauge.class); + Assert.assertEquals( + metricCache.getMetricFrontend("GPUMemoryUtilization").getClass(), + PrometheusGauge.class); + Assert.assertEquals( + metricCache.getMetricFrontend("GPUMemoryUsed").getClass(), PrometheusGauge.class); + Assert.assertEquals( + metricCache.getMetricFrontend("GPUUtilization").getClass(), PrometheusGauge.class); + Assert.assertEquals(metricCache.getMetricFrontend("InvalidMetric"), null); + Assert.assertEquals( + metricCache.getMetricBackend("HandlerTime").getClass(), PrometheusGauge.class); + Assert.assertEquals( + metricCache.getMetricBackend("PredictionTime").getClass(), PrometheusGauge.class); + Assert.assertEquals(metricCache.getMetricBackend("InvalidMetric"), null); + } +} diff --git a/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricConfigurationTest.java b/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricConfigurationTest.java new file mode 100644 index 0000000000..ba8b5ca58a --- /dev/null +++ b/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricConfigurationTest.java @@ -0,0 +1,136 @@ +package org.pytorch.serve.metrics.configuration; + +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import org.pytorch.serve.util.ConfigManager; +import org.testng.Assert; +import org.testng.annotations.Test; +import org.yaml.snakeyaml.composer.ComposerException; + +public class MetricConfigurationTest { + @Test + public void testLoadValidConfiguration() + throws FileNotFoundException, ComposerException, RuntimeException { + MetricConfiguration config = + MetricConfiguration.loadConfiguration( + "src/test/resources/metrics/valid_configuration.yaml"); + + Assert.assertEquals( + config.getDimensions(), + new ArrayList( + Arrays.asList("ModelName", "ModelVersion", "Level", "Hostname"))); + + Assert.assertEquals(config.getTs_metrics().getCounter().size(), 2); + + MetricSpecification spec = config.getTs_metrics().getCounter().get(0); + Assert.assertEquals(spec.getName(), "Requests2XX"); + Assert.assertEquals(spec.getUnit(), "Count"); + Assert.assertEquals( + spec.getDimensions(), new ArrayList(Arrays.asList("Level", "Hostname"))); + + spec = config.getTs_metrics().getCounter().get(1); + Assert.assertEquals(spec.getName(), "InferenceRequestsTotal"); + Assert.assertEquals(spec.getUnit(), "Count"); + Assert.assertEquals( + spec.getDimensions(), + new ArrayList(Arrays.asList("ModelName", "ModelVersion", "Hostname"))); + + Assert.assertEquals(config.getTs_metrics().getGauge().size(), 2); + + spec = config.getTs_metrics().getGauge().get(0); + Assert.assertEquals(spec.getName(), "QueueTime"); + Assert.assertEquals(spec.getUnit(), "Milliseconds"); + Assert.assertEquals( + spec.getDimensions(), new ArrayList(Arrays.asList("Level", "Hostname"))); + + spec = config.getTs_metrics().getGauge().get(1); + Assert.assertEquals(spec.getName(), "WorkerThreadTime"); + Assert.assertEquals(spec.getUnit(), 
"Milliseconds"); + Assert.assertEquals( + spec.getDimensions(), new ArrayList(Arrays.asList("Level", "Hostname"))); + + Assert.assertEquals(config.getTs_metrics().getHistogram(), null); + + Assert.assertEquals(config.getModel_metrics().getCounter(), null); + + Assert.assertEquals(config.getModel_metrics().getGauge().size(), 2); + + spec = config.getModel_metrics().getGauge().get(0); + Assert.assertEquals(spec.getName(), "HandlerTime"); + Assert.assertEquals(spec.getUnit(), "ms"); + Assert.assertEquals( + spec.getDimensions(), + new ArrayList(Arrays.asList("ModelName", "Level", "Hostname"))); + + spec = config.getModel_metrics().getGauge().get(1); + Assert.assertEquals(spec.getName(), "PredictionTime"); + Assert.assertEquals(spec.getUnit(), "ms"); + Assert.assertEquals( + spec.getDimensions(), + new ArrayList(Arrays.asList("ModelName", "Level", "Hostname"))); + + Assert.assertEquals(config.getModel_metrics().getHistogram(), null); + } + + @Test + public void testLoadValidConfigurationEmptyMetricDimensions() + throws FileNotFoundException, ComposerException, RuntimeException { + MetricConfiguration config = + MetricConfiguration.loadConfiguration( + "src/test/resources/metrics/valid_configuration_empty_metric_dimensions.yaml"); + + Assert.assertEquals(config.getDimensions(), null); + + Assert.assertEquals(config.getTs_metrics().getCounter().size(), 1); + + MetricSpecification spec = config.getTs_metrics().getCounter().get(0); + Assert.assertEquals(spec.getName(), "InferenceRequestsTotal"); + Assert.assertEquals(spec.getUnit(), "Count"); + Assert.assertEquals(spec.getDimensions(), null); + + Assert.assertEquals(config.getTs_metrics().getGauge(), null); + + Assert.assertEquals(config.getTs_metrics().getHistogram(), null); + + Assert.assertEquals(config.getModel_metrics(), null); + } + + @Test + public void testLoadInvalidConfigurationMissingDimension() { + Assert.assertThrows( + ComposerException.class, + () -> + MetricConfiguration.loadConfiguration( + "src/test/resources/metrics/invalid_configuration_missing_dimension.yaml")); + } + + @Test + public void testLoadInvalidConfigurationMissingMetricName() { + Assert.assertThrows( + RuntimeException.class, + () -> + MetricConfiguration.loadConfiguration( + "src/test/resources/metrics/invalid_configuration_missing_metric_name.yaml")); + } + + @Test + public void testLoadInvalidConfigurationMissingMetricUnit() { + Assert.assertThrows( + RuntimeException.class, + () -> + MetricConfiguration.loadConfiguration( + "src/test/resources/metrics/invalid_configuration_missing_metric_unit.yaml")); + } + + @Test + public void testMetricsModeConfiguration() { + ConfigManager configManager = ConfigManager.getInstance(); + String existingMetricsModeConfiguration = configManager.getMetricsMode(); + Assert.assertEquals(existingMetricsModeConfiguration, "log"); + configManager.setProperty("metrics_mode", "test_metrics_mode"); + Assert.assertEquals(configManager.getMetricsMode(), "test_metrics_mode"); + // Restore original metrics mode configuration + configManager.setProperty("metrics_mode", existingMetricsModeConfiguration); + } +} diff --git a/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricTest.java b/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricTest.java new file mode 100644 index 0000000000..5591e93910 --- /dev/null +++ b/frontend/server/src/test/java/org/pytorch/serve/metrics/MetricTest.java @@ -0,0 +1,269 @@ +package org.pytorch.serve.metrics; + +import io.prometheus.client.CollectorRegistry; +import java.io.StringWriter; 
+import java.util.ArrayList; +import java.util.Arrays; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.core.Logger; +import org.apache.logging.log4j.core.appender.WriterAppender; +import org.pytorch.serve.metrics.format.prometheous.PrometheusCounter; +import org.pytorch.serve.metrics.format.prometheous.PrometheusGauge; +import org.pytorch.serve.metrics.format.prometheous.PrometheusHistogram; +import org.pytorch.serve.util.ConfigManager; +import org.testng.Assert; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +@Test(singleThreaded = true) +public class MetricTest { + private final String testMetricName = "TestMetric"; + private final String testMetricUnit = "ms"; + private final ArrayList testMetricDimensionNames = + new ArrayList(Arrays.asList("ModelName", "Level", "Hostname")); + private final ArrayList testMetricDimensionValues = + new ArrayList(Arrays.asList("TestModel", "Model", "TestHost")); + private final String testRequestId = "fa8639a8-d3fa-4a25-a80f-24463863fe0f"; + private final Logger loggerModelMetrics = + (org.apache.logging.log4j.core.Logger) + LogManager.getLogger(ConfigManager.MODEL_METRICS_LOGGER); + private final Logger loggerTsMetrics = + (org.apache.logging.log4j.core.Logger) + LogManager.getLogger(ConfigManager.MODEL_SERVER_METRICS_LOGGER); + private final String modelMetricsAppenderName = "ModelMetricsAppender"; + private final String tsMetricsAppenderName = "TsMetricsAppender"; + private final StringWriter modelMetricsContent = new StringWriter(); + private final StringWriter tsMetricsContent = new StringWriter(); + private final WriterAppender modelMetricsAppender = + WriterAppender.createAppender( + null, null, modelMetricsContent, modelMetricsAppenderName, true, false); + private final WriterAppender tsMetricsAppender = + WriterAppender.createAppender( + null, null, tsMetricsContent, tsMetricsAppenderName, true, false); + + @BeforeClass + public void registerMetricLogAppenders() { + loggerModelMetrics.addAppender(modelMetricsAppender); + modelMetricsAppender.start(); + loggerTsMetrics.addAppender(tsMetricsAppender); + tsMetricsAppender.start(); + } + + @BeforeMethod + public void flushLogWriterStreams() { + modelMetricsContent.flush(); + tsMetricsContent.flush(); + } + + @BeforeMethod + public void clearPrometheusRegistry() { + CollectorRegistry.defaultRegistry.clear(); + } + + @AfterClass + public void unregisterMetricLogAppenders() { + modelMetricsAppender.stop(); + loggerModelMetrics.removeAppender(modelMetricsAppender); + tsMetricsAppender.stop(); + loggerTsMetrics.removeAppender(tsMetricsAppender); + } + + @Test + public void testBackendLogMetric() { + IMetric testMetric = + MetricBuilder.build( + MetricBuilder.MetricMode.LOG, + MetricBuilder.MetricType.COUNTER, + testMetricName, + testMetricUnit, + testMetricDimensionNames); + Assert.assertEquals(testMetric.getClass(), LogMetric.class); + + testMetric.addOrUpdate(testMetricDimensionValues, testRequestId, 1.0); + String expectedMetricString = + "TestMetric.ms:1.0|#ModelName:TestModel,Level:Model|#hostname:TestHost," + + "requestID:fa8639a8-d3fa-4a25-a80f-24463863fe0f,timestamp:"; + Assert.assertTrue(modelMetricsContent.toString().contains(expectedMetricString)); + } + + @Test + public void testFrontendLogMetric() { + IMetric testMetric = + MetricBuilder.build( + MetricBuilder.MetricMode.LOG, + MetricBuilder.MetricType.GAUGE, + testMetricName, + 
testMetricUnit, + testMetricDimensionNames); + Assert.assertEquals(testMetric.getClass(), LogMetric.class); + + testMetric.addOrUpdate(testMetricDimensionValues, 1.0); + String expectedMetricString = + "TestMetric.ms:1.0|#ModelName:TestModel,Level:Model|#hostname:TestHost,timestamp:"; + Assert.assertTrue(tsMetricsContent.toString().contains(expectedMetricString)); + } + + @Test + public void testBackendPrometheusCounter() { + IMetric testMetric = + MetricBuilder.build( + MetricBuilder.MetricMode.PROMETHEUS, + MetricBuilder.MetricType.COUNTER, + testMetricName, + testMetricUnit, + testMetricDimensionNames); + Assert.assertEquals(testMetric.getClass(), PrometheusCounter.class); + + testMetric.addOrUpdate(testMetricDimensionValues, testRequestId, 1.0); + Double metricValue = + CollectorRegistry.defaultRegistry.getSampleValue( + testMetricName, + testMetricDimensionNames.toArray(new String[0]), + testMetricDimensionValues.toArray(new String[0])); + Assert.assertEquals(metricValue, Double.valueOf(1.0)); + testMetric.addOrUpdate(testMetricDimensionValues, testRequestId, 2.0); + metricValue = + CollectorRegistry.defaultRegistry.getSampleValue( + testMetricName, + testMetricDimensionNames.toArray(new String[0]), + testMetricDimensionValues.toArray(new String[0])); + Assert.assertEquals(metricValue, Double.valueOf(3.0)); + } + + @Test + public void testFrontendPrometheusCounter() { + IMetric testMetric = + MetricBuilder.build( + MetricBuilder.MetricMode.PROMETHEUS, + MetricBuilder.MetricType.COUNTER, + testMetricName, + testMetricUnit, + testMetricDimensionNames); + Assert.assertEquals(testMetric.getClass(), PrometheusCounter.class); + + testMetric.addOrUpdate(testMetricDimensionValues, 1.0); + Double metricValue = + CollectorRegistry.defaultRegistry.getSampleValue( + testMetricName, + testMetricDimensionNames.toArray(new String[0]), + testMetricDimensionValues.toArray(new String[0])); + Assert.assertEquals(metricValue, Double.valueOf(1.0)); + testMetric.addOrUpdate(testMetricDimensionValues, 2.0); + metricValue = + CollectorRegistry.defaultRegistry.getSampleValue( + testMetricName, + testMetricDimensionNames.toArray(new String[0]), + testMetricDimensionValues.toArray(new String[0])); + Assert.assertEquals(metricValue, Double.valueOf(3.0)); + } + + @Test + public void testBackendPrometheusGauge() { + IMetric testMetric = + MetricBuilder.build( + MetricBuilder.MetricMode.PROMETHEUS, + MetricBuilder.MetricType.GAUGE, + testMetricName, + testMetricUnit, + testMetricDimensionNames); + Assert.assertEquals(testMetric.getClass(), PrometheusGauge.class); + + testMetric.addOrUpdate(testMetricDimensionValues, testRequestId, 1.0); + Double metricValue = + CollectorRegistry.defaultRegistry.getSampleValue( + testMetricName, + testMetricDimensionNames.toArray(new String[0]), + testMetricDimensionValues.toArray(new String[0])); + Assert.assertEquals(metricValue, Double.valueOf(1.0)); + testMetric.addOrUpdate(testMetricDimensionValues, testRequestId, 2.0); + metricValue = + CollectorRegistry.defaultRegistry.getSampleValue( + testMetricName, + testMetricDimensionNames.toArray(new String[0]), + testMetricDimensionValues.toArray(new String[0])); + Assert.assertEquals(metricValue, Double.valueOf(2.0)); + } + + @Test + public void testFrontendPrometheusGauge() { + IMetric testMetric = + MetricBuilder.build( + MetricBuilder.MetricMode.PROMETHEUS, + MetricBuilder.MetricType.GAUGE, + testMetricName, + testMetricUnit, + testMetricDimensionNames); + Assert.assertEquals(testMetric.getClass(), PrometheusGauge.class); + + 
testMetric.addOrUpdate(testMetricDimensionValues, 1.0); + Double metricValue = + CollectorRegistry.defaultRegistry.getSampleValue( + testMetricName, + testMetricDimensionNames.toArray(new String[0]), + testMetricDimensionValues.toArray(new String[0])); + Assert.assertEquals(metricValue, Double.valueOf(1.0)); + testMetric.addOrUpdate(testMetricDimensionValues, 2.0); + metricValue = + CollectorRegistry.defaultRegistry.getSampleValue( + testMetricName, + testMetricDimensionNames.toArray(new String[0]), + testMetricDimensionValues.toArray(new String[0])); + Assert.assertEquals(metricValue, Double.valueOf(2.0)); + } + + @Test + public void testBackendPrometheusHistogram() { + IMetric testMetric = + MetricBuilder.build( + MetricBuilder.MetricMode.PROMETHEUS, + MetricBuilder.MetricType.HISTOGRAM, + testMetricName, + testMetricUnit, + testMetricDimensionNames); + Assert.assertEquals(testMetric.getClass(), PrometheusHistogram.class); + + testMetric.addOrUpdate(testMetricDimensionValues, testRequestId, 1.0); + Double metricValue = + CollectorRegistry.defaultRegistry.getSampleValue( + testMetricName + "_sum", + testMetricDimensionNames.toArray(new String[0]), + testMetricDimensionValues.toArray(new String[0])); + Assert.assertEquals(metricValue, Double.valueOf(1.0)); + testMetric.addOrUpdate(testMetricDimensionValues, testRequestId, 2.0); + metricValue = + CollectorRegistry.defaultRegistry.getSampleValue( + testMetricName + "_sum", + testMetricDimensionNames.toArray(new String[0]), + testMetricDimensionValues.toArray(new String[0])); + Assert.assertEquals(metricValue, Double.valueOf(3.0)); + } + + @Test + public void testFrontendPrometheusHistogram() { + IMetric testMetric = + MetricBuilder.build( + MetricBuilder.MetricMode.PROMETHEUS, + MetricBuilder.MetricType.HISTOGRAM, + testMetricName, + testMetricUnit, + testMetricDimensionNames); + Assert.assertEquals(testMetric.getClass(), PrometheusHistogram.class); + + testMetric.addOrUpdate(testMetricDimensionValues, 1.0); + Double metricValue = + CollectorRegistry.defaultRegistry.getSampleValue( + testMetricName + "_sum", + testMetricDimensionNames.toArray(new String[0]), + testMetricDimensionValues.toArray(new String[0])); + Assert.assertEquals(metricValue, Double.valueOf(1.0)); + testMetric.addOrUpdate(testMetricDimensionValues, 2.0); + metricValue = + CollectorRegistry.defaultRegistry.getSampleValue( + testMetricName + "_sum", + testMetricDimensionNames.toArray(new String[0]), + testMetricDimensionValues.toArray(new String[0])); + Assert.assertEquals(metricValue, Double.valueOf(3.0)); + } +} diff --git a/frontend/server/src/test/resources/config.properties b/frontend/server/src/test/resources/config.properties index 233c73349f..7b8f3e29a2 100644 --- a/frontend/server/src/test/resources/config.properties +++ b/frontend/server/src/test/resources/config.properties @@ -46,4 +46,3 @@ models={\ # install_py_dep_per_model=false # enable_metrics_api=false workflow_store=../archive/src/test/resources/workflows -metrics_config=src/test/resources/metrics_default.yaml diff --git a/frontend/server/src/test/resources/config_test_env.properties b/frontend/server/src/test/resources/config_test_env.properties index a9746782ab..7e58e903b2 100644 --- a/frontend/server/src/test/resources/config_test_env.properties +++ b/frontend/server/src/test/resources/config_test_env.properties @@ -10,6 +10,7 @@ load_models=noop-v0.1,noop-v1.0 # netty_client_threads=0 # default_workers_per_model=0 # job_queue_size=100 +# n_priorities=1 async_logging=true default_response_timeout=120 
unregister_model_timeout=120 diff --git a/frontend/server/src/test/resources/metrics/invalid_configuration_missing_dimension.yaml b/frontend/server/src/test/resources/metrics/invalid_configuration_missing_dimension.yaml new file mode 100644 index 0000000000..680e8cd037 --- /dev/null +++ b/frontend/server/src/test/resources/metrics/invalid_configuration_missing_dimension.yaml @@ -0,0 +1,9 @@ +dimensions: + - &model_name "ModelName" + - &model_version "ModelVersion" + +ts_metrics: + counter: + - name: InferenceRequestsTotal + unit: Count + dimensions: [*model_name, *model_version, *hostname] diff --git a/frontend/server/src/test/resources/metrics/invalid_configuration_missing_metric_name.yaml b/frontend/server/src/test/resources/metrics/invalid_configuration_missing_metric_name.yaml new file mode 100644 index 0000000000..8c064e6140 --- /dev/null +++ b/frontend/server/src/test/resources/metrics/invalid_configuration_missing_metric_name.yaml @@ -0,0 +1,9 @@ +dimensions: + - &model_name "ModelName" + - &model_version "ModelVersion" + - &hostname "Hostname" + +ts_metrics: + counter: + - unit: Count + dimensions: [*model_name, *model_version, *hostname] diff --git a/frontend/server/src/test/resources/metrics/invalid_configuration_missing_metric_unit.yaml b/frontend/server/src/test/resources/metrics/invalid_configuration_missing_metric_unit.yaml new file mode 100644 index 0000000000..9b76ed2363 --- /dev/null +++ b/frontend/server/src/test/resources/metrics/invalid_configuration_missing_metric_unit.yaml @@ -0,0 +1,9 @@ +dimensions: + - &model_name "ModelName" + - &model_version "ModelVersion" + - &hostname "Hostname" + +ts_metrics: + counter: + - name: InferenceRequestsTotal + dimensions: [*model_name, *model_version, *hostname] diff --git a/frontend/server/src/test/resources/metrics/valid_configuration.yaml b/frontend/server/src/test/resources/metrics/valid_configuration.yaml new file mode 100644 index 0000000000..f225a04102 --- /dev/null +++ b/frontend/server/src/test/resources/metrics/valid_configuration.yaml @@ -0,0 +1,30 @@ +dimensions: + - &model_name "ModelName" + - &model_version "ModelVersion" + - &level "Level" + - &hostname "Hostname" + +ts_metrics: + counter: + - name: Requests2XX + unit: Count + dimensions: [*level, *hostname] + - name: InferenceRequestsTotal + unit: Count + dimensions: [*model_name, *model_version, *hostname] + gauge: + - name: QueueTime + unit: Milliseconds + dimensions: [*level, *hostname] + - name: WorkerThreadTime + unit: Milliseconds + dimensions: [*level, *hostname] + +model_metrics: + gauge: + - name: HandlerTime + unit: ms + dimensions: [*model_name, *level] + - name: PredictionTime + unit: ms + dimensions: [*model_name, *level] diff --git a/frontend/server/src/test/resources/metrics/valid_configuration_empty_metric_dimensions.yaml b/frontend/server/src/test/resources/metrics/valid_configuration_empty_metric_dimensions.yaml new file mode 100644 index 0000000000..f338c5f48a --- /dev/null +++ b/frontend/server/src/test/resources/metrics/valid_configuration_empty_metric_dimensions.yaml @@ -0,0 +1,4 @@ +ts_metrics: + counter: + - name: InferenceRequestsTotal + unit: Count diff --git a/frontend/server/testng.xml b/frontend/server/testng.xml index de8a0d574c..2518798907 100644 --- a/frontend/server/testng.xml +++ b/frontend/server/testng.xml @@ -1,5 +1,5 @@ -<!DOCTYPE suite SYSTEM "http://testng.org/testng-1.0.dtd" > +<!DOCTYPE suite SYSTEM "https://testng.org/testng-1.0.dtd" > @@ -9,6 +9,9 @@ + <class name="org.pytorch.serve.metrics.MetricCacheTest"/> + <class name="org.pytorch.serve.metrics.MetricConfigurationTest"/> + <class name="org.pytorch.serve.metrics.MetricTest"/> diff --git a/kubernetes/AKS/README.md b/kubernetes/AKS/README.md index c2179e8cef..7f1c7ca32e 100644 --- a/kubernetes/AKS/README.md +++ b/kubernetes/AKS/README.md @@ -26,7 +26,7 @@ Use
the [az aks create](https://docs.microsoft.com/en-us/cli/azure/aks?view=azur #### 1.4 Connect to the cluster -To manage a Kubernetes cluster, you use [kubectl](https://kubernetes.io/docs/user-guide/kubectl/), the Kubernetes command-line client. If you use Azure Cloud Shell, `kubectl` is already installed. To install `kubectl` locally, use the [az aks install-cli](https://docs.microsoft.com/en-us/cli/azure/aks?view=azure-cli-latest#az-aks-install-cli) command: +To manage a Kubernetes cluster, you use [kubectl](https://kubernetes.io/docs/reference/kubectl/), the Kubernetes command-line client. If you use Azure Cloud Shell, `kubectl` is already installed. To install `kubectl` locally, use the [az aks install-cli](https://docs.microsoft.com/en-us/cli/azure/aks?view=azure-cli-latest#az-aks-install-cli) command: ```az aks install-cli``` diff --git a/kubernetes/AKS/config.properties b/kubernetes/AKS/config.properties index 9f7ad861a7..e6003a92c8 100644 --- a/kubernetes/AKS/config.properties +++ b/kubernetes/AKS/config.properties @@ -2,7 +2,7 @@ inference_address=http://0.0.0.0:8080 management_address=http://0.0.0.0:8081 metrics_address=http://0.0.0.0:8082 enable_metrics_api=true -metrics_format=prometheus +metrics_mode=prometheus NUM_WORKERS=1 number_of_gpu=1 number_of_netty_threads=32 diff --git a/kubernetes/GKE/README.md b/kubernetes/GKE/README.md index fccc532068..9be81963f9 100644 --- a/kubernetes/GKE/README.md +++ b/kubernetes/GKE/README.md @@ -59,7 +59,7 @@ gcloud container clusters create torchserve --machine-type n1-standard-4 --accel #### 1.3 Connect to the cluster -To manage a Kubernetes cluster, you use [kubectl](https://kubernetes.io/docs/user-guide/kubectl/), the Kubernetes command-line client. If you use GKE Cloud Shell, `kubectl` is already installed. To install `kubectl` locally, use the [gcloud components install](https://kubernetes.io/docs/tasks/tools/install-kubectl/) command: +To manage a Kubernetes cluster, you use [kubectl](https://kubernetes.io/docs/reference/kubectl/), the Kubernetes command-line client. If you use GKE Cloud Shell, `kubectl` is already installed. To install `kubectl` locally, use the [gcloud components install](https://kubernetes.io/docs/tasks/tools/install-kubectl/) command: Below command require Cloud SDK component manager enabled. diff --git a/kubernetes/kserve/README.md b/kubernetes/kserve/README.md index d040ed7d63..c35cd2cabf 100644 --- a/kubernetes/kserve/README.md +++ b/kubernetes/kserve/README.md @@ -117,7 +117,7 @@ grpc_management_port=7071 enable_envvars_config=true install_py_dep_per_model=true enable_metrics_api=true -metrics_format=prometheus +metrics_mode=prometheus NUM_WORKERS=1 number_of_netty_threads=4 job_queue_size=10 @@ -189,7 +189,7 @@ Refer link for more [examples](https://github.com/kserve/kserve/tree/master/docs KServe supports different types of inputs (ex: tensor, bytes). Use the following instructions to generate input files based on its type. 
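Whichever generator is used (the per-model instructions are linked just below), posting the resulting file is the same for both protocols. As an illustration, here is a hedged Java equivalent of the `curl` calls shown later in this document; the host, the `mnist` model name, and the `mnist_v2_bytes.json` file name are assumptions taken from the MNIST walkthrough.

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Path;

// Posts a generated KServe v2 input file to the inference endpoint.
public class V2InferenceClient {
    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newHttpClient();
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://localhost:8080/v2/models/mnist/infer"))
                .header("Content-Type", "application/json")
                .POST(HttpRequest.BodyPublishers.ofFile(Path.of("mnist_v2_bytes.json")))
                .build();
        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.statusCode()); // expect 200
        System.out.println(response.body());       // v2 PredictionResponse JSON
    }
}
```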
-[MNIST input generation](kf_request_json/v2/mnist/README.md##-Preparing-input) +[MNIST input generation](kf_request_json/v2/mnist/README.md##-Preparing-input) [Bert input generation](kf_request_json/v2/bert/README.md##-Preparing-input) @@ -233,7 +233,7 @@ Refer the individual readmes for KServe : * [BERT](https://github.com/kserve/kserve/blob/master/docs/samples/v1beta1/custom/torchserve/bert-sample/hugging-face-bert-sample.md) * [MNIST](https://github.com/kserve/kserve/blob/master/docs/samples/v1beta1/torchserve/README.md) -Sample input JSON file for v1 and v2 protocols +Sample input JSON file for v1 and v2 protocols For v1 protocol diff --git a/kubernetes/kserve/config.properties b/kubernetes/kserve/config.properties index 7c9c33589b..422e53d138 100644 --- a/kubernetes/kserve/config.properties +++ b/kubernetes/kserve/config.properties @@ -7,7 +7,7 @@ grpc_management_port=7071 enable_envvars_config=true install_py_dep_per_model=true enable_metrics_api=true -metrics_format=prometheus +metrics_mode=prometheus NUM_WORKERS=1 number_of_netty_threads=4 job_queue_size=10 diff --git a/kubernetes/kserve/developer_guide.md b/kubernetes/kserve/developer_guide.md index 1692db53ec..5a3281ebc6 100644 --- a/kubernetes/kserve/developer_guide.md +++ b/kubernetes/kserve/developer_guide.md @@ -33,7 +33,7 @@ grpc_management_port=7071 enable_envvars_config=true install_py_dep_per_model=true enable_metrics_api=true -metrics_format=prometheus +metrics_mode=prometheus NUM_WORKERS=1 number_of_netty_threads=4 job_queue_size=10 diff --git a/kubernetes/kserve/image_transformer/README.md b/kubernetes/kserve/image_transformer/README.md index 05d3a2d3be..733d71aee3 100644 --- a/kubernetes/kserve/image_transformer/README.md +++ b/kubernetes/kserve/image_transformer/README.md @@ -86,7 +86,7 @@ inference_address=http://0.0.0.0:8085 management_address=http://0.0.0.0:8085 metrics_address=http://0.0.0.0:8082 enable_metrics_api=true -metrics_format=prometheus +metrics_mode=prometheus NUM_WORKERS=1 number_of_netty_threads=4 job_queue_size=10 diff --git a/kubernetes/kserve/kf_request_json/v2/bert/README.md b/kubernetes/kserve/kf_request_json/v2/bert/README.md index 8f157f57ec..333a2cdcc7 100644 --- a/kubernetes/kserve/kf_request_json/v2/bert/README.md +++ b/kubernetes/kserve/kf_request_json/v2/bert/README.md @@ -16,9 +16,9 @@ Run the following command to download the model ``` python Download_Transformer_models.py ``` - + ### Generate mar file - + ```bash torch-model-archiver --model-name BERTSeqClassification --version 1.0 \ --serialized-file Transformer_model/pytorch_model.bin \ @@ -33,7 +33,7 @@ Move the mar file to model-store ``` sudo mv BERTSeqClassification.mar /mnt/models/model-store ``` - + and use the following config properties (`/mnt/models/config`) ``` @@ -44,7 +44,7 @@ enable_envvars_config=true install_py_dep_per_model=true enable_metrics_api=true service_envelope=kservev2 -metrics_format=prometheus +metrics_mode=prometheus NUM_WORKERS=1 number_of_netty_threads=4 job_queue_size=10 @@ -58,7 +58,7 @@ Use [bert_bytes_v2.json](bert_bytes_v2.json) or [bert_tensor_v2](bert_tensor_v2. For new sample text, follow the instructions below -For bytes input, use [tobytes](tobytes.py) utility. +For bytes input, use [tobytes](tobytes.py) utility. 
``` python tobytes.py --input_text "this year business is good" @@ -118,4 +118,4 @@ curl -v -H "ContentType: application/json" http://localhost:8080/v2/models/BERTS Expected output ```bash {"id": "33abc661-7265-42fc-b7d9-44e5f79a7a67", "model_name": "BERTSeqClassification", "model_version": "1.0", "outputs": [{"name": "predict", "shape": [], "datatype": "BYTES", "data": ["Not Accepted"]}]} -``` \ No newline at end of file +``` diff --git a/kubernetes/kserve/kf_request_json/v2/mnist/README.md b/kubernetes/kserve/kf_request_json/v2/mnist/README.md index 9fa03a3a3e..dcfcd1bd2b 100644 --- a/kubernetes/kserve/kf_request_json/v2/mnist/README.md +++ b/kubernetes/kserve/kf_request_json/v2/mnist/README.md @@ -7,7 +7,7 @@ model locally using kserve. Clone [pytorch/serve](https://github.com/pytorch/serve) repository navigate to `examples/image_classifier/mnist` - + ```bash torch-model-archiver --model-name mnist --version 1.0 \ --model-file mnist.py \ @@ -17,15 +17,15 @@ torch-model-archiver --model-name mnist --version 1.0 \ The command will create `mnist.mar` file in current directory -Move the mar file to model-store +Move the mar file to model-store -``` +```bash sudo mv mnist.mar /mnt/models/model-store ``` and use the following config properties (`/mnt/models/config`) -``` +```conf inference_address=http://0.0.0.0:8085 management_address=http://0.0.0.0:8085 metrics_address=http://0.0.0.0:8082 @@ -33,7 +33,7 @@ enable_envvars_config=true install_py_dep_per_model=true enable_metrics_api=true service_envelope=kservev2 -metrics_format=prometheus +metrics_mode=prometheus NUM_WORKERS=1 number_of_netty_threads=4 job_queue_size=10 @@ -49,15 +49,15 @@ For generating input for a new image follow the instructions given below Move to `kubernetes/kserve/kf_request_json/v2/mnist` -For bytes input, use [tobytes](tobytes.py) utility. +For bytes input, use [tobytes](tobytes.py) utility. 
-``` +```bash python tobytes.py 0.png ``` For tensor input, use [totensor](totensor.py) utility -``` +```bash python totensor.py 0.png ``` @@ -66,7 +66,7 @@ python totensor.py 0.png Start TorchServe -``` +```bash torchserve --start --ts-config /mnt/models/config/config.properties --ncs ``` @@ -74,7 +74,7 @@ To test locally, clone TorchServe and move to the following folder `kubernetes/k Start Kserve -``` +```bash python __main__.py ``` @@ -85,12 +85,12 @@ Navigate to `kubernetes/kserve/kf_request_json/v2/mnist` Run the following command ```bash -curl -v -H "ContentType: application/json" http://localhost:8080/v2/models/mnist/infer -d @./mnist_v2_bytes.json +curl -v -H "Content-Type: application/json" http://localhost:8080/v2/models/mnist/infer -d @./mnist_v2_bytes.json ``` Expected Output -```bash +```json {"id": "d3b15cad-50a2-4eaf-80ce-8b0a428bd298", "model_name": "mnist", "model_version": "1.0", "outputs": [{"name": "predict", "shape": [1], "datatype": "INT64", "data": [0]}]} ``` @@ -100,8 +100,8 @@ Expected Output Run the following command -``` -curl -v -H "ContentType: application/json" http://localhost:8080/v2/models/mnist/infer -d @./mnist_v2_tensor.json +```bash +curl -v -H "Content-Type: application/json" http://localhost:8080/v2/models/mnist/infer -d @./mnist_v2_tensor.json ``` Expected output @@ -115,10 +115,11 @@ Expected output Run the following command ```bash -curl -v -H "ContentType: application/json" http://localhost:8080/v2/models/mnist/explain -d @./mnist_v2_bytes.json +curl -v -H "Content-Type: application/json" http://localhost:8080/v2/models/mnist/explain -d @./mnist_v2_bytes.json ``` Expected output -```bash + +```json {"id": "d3b15cad-50a2-4eaf-80ce-8b0a428bd298", "model_name": "mnist", "model_version": "1.0", "outputs": [{"name": "explain", "shape": [1, 28, 28], "datatype": "FP64", "data": [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, 0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0040547529196303285, -0.000226128774499257, -0.00012734138382422276, 0.005648369544853077, 0.0089047843954152, 0.002638536593970295, 0.002680245911942565, -0.0026578015819202173, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00024465771891337887, 0.0008218450954311162, 0.01528591767842519, 0.007512832335428859, 0.00709498458333515, 0.0034056686436576803, -0.002091925041823873, -0.0007800293875604465, 0.02299587827540853, 0.019004329367380418, -0.0012529559050418735, -0.0014666116646934577, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005298396405518712, -0.0007901605729004231, 0.0039060659926479398, 0.023174082126728335, 0.01723791770922474, 0.010867034167828598, 0.003001563229273835, 0.00622421771715703, 0.006120712207087491, 0.01673632965122119, 0.005674718948781803, 0.004344134599735745, -0.0012328422311881568, -0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 
0.0006867353833785289, 0.009772899792600862, -0.0038754932221901437, 0.001798693579973005, 0.001307544047675232, -0.0024510981010352315, -0.0008806773488194292, -0.0, -0.0, -0.00014277890760828639, -0.009322313235257151, 0.020608317727589167, 0.004351394518148479, -0.0007875566214137449, -0.0009075897508410689, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00022247238084657642, -0.0007829029819622099, 0.0026663695200516055, 0.0009733366691924418, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 0.0004323207980879993, 0.023657171939959983, 0.01069484496100618, -0.0023759529165659743, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.002074797197335781, -0.002320101263777886, -0.001289920656543141, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.007629679763806616, 0.01044862710854819, 0.00025032875474040415, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0003770836745884539, -0.005156369309364184, 0.0012477582083019567, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, -4.442513564501309e-05, 0.010248046436803096, 0.0009971133914441863, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, 0.0, 0.0, -0.0, 0.0004501048922351147, -0.00196305355861066, -0.0006664792277975681, 0.0020157403871024866, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.002214456978582924, 0.008361583668963536, 0.0031401942747203444, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0028943545250037983, -0.0031301382844878753, 0.002113252994616467, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0010321050071136991, 0.008905753948020954, 0.0028464383724280478, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0053052889804602885, -0.0019271100770928186, 0.0012090042664300153, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0011945155805738324, 0.005654442809865844, 0.0020132075147173286, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0014689358119857122, 0.0010743412654248086, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0017047980433136346, 0.0029066051664685937, -0.0007805868937027288, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, 5.541726090138969e-05, 0.0014516115182299915, 0.0002827700518397855, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.001440140782635336, 0.002381249982038837, 0.002146825452068144, -0.0, -0.0, 0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.001150052970321427, 0.0002865015237050364, 0.0029798150346815985, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.001775029606380323, 0.000833985914685474, -0.003770739075457816, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, -0.0006093176893524411, -0.00046905781658387527, 0.0034053217440919658, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0007450012183962096, 0.001298767353118675, -0.008499247802184222, -6.145165255574976e-05, -0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, 0.0, 0.0011809726462884672, -0.0018384763902449712, 0.005411106715800028, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0021392341817010304, 0.0003259163122540385, -0.005276118905978749, -0.0019509840184772497, -9.545685077687876e-07, 0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0007772404694664217, -0.0001517954537059768, 0.006481484678129392, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 8.098064554131295e-05, -0.0024904264199929506, -0.0020718618328775897, -5.3411287747038166e-05, 
-0.0004556472202791715, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0022750984867578, 0.001716405971437602, 0.0003221344811922982, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0015560282437342534, 9.107229584202956e-05, 0.0008772841867241755, 0.0006502979194500701, -0.004128780661881036, 0.0006030386196211547, 0.0, -0.0, 0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0013959959731925453, 0.0026791526421029673, 0.002399500793142178, -0.00044960969955281656, 0.003101832495190209, 0.007494535809079955, 0.002864118744003058, -0.003052590549800204, 0.003420222341277871, 0.0014924017873988514, -0.0009357389226494119, 0.0007856229438140384, -0.001843397373255761, 1.6031851430693252e-05, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.000699901824825285, 0.0043822508549258565, -0.003541931476855951, -0.0028896746311921715, -0.0004873454583246359, -0.006087345141728267, 0.000388224886755815, 0.002533641621974457, -0.004352836429303485, -0.0006079421449756437, -0.003810133409713042, -0.0008284413779488711, 0.0, -0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0010901530854686326, -0.013135007707490608, 0.0004734520308098294, 0.0020504232707536456, -0.006609452262924153, 0.0023647861306777536, 0.004678920703192049, -0.0018122526857900652, 0.0021375383049022263, 0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}]} -``` \ No newline at end of file +``` diff --git a/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json b/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json index 0c07866dba..683ada7b73 100644 --- a/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json +++ b/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json @@ -1,10 +1,10 @@ { "inputs": [ { - "data": "iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAAAAABXZoBIAAAA10lEQVR4nGNgGFhgy6xVdrCszBaLFN/mr28+/QOCr69DMCSnA8WvHti0acu/fx/10OS0X/975CDDw8DA1PDn/1pBVEmLf3+zocy2X/+8USXt/82Ds+/+m4sqeehfOpw97d9VFDmlO++t4JwQNMm6f6sZcEpee2+DR/I4A05J7tt4JJP+IUsu+ncRp6TxO9RAQJY0XvrvMAuypNNHuCTz8n+PzVEcy3DtqgiY1ptx6t8/ewY0yX9ntoDA63//Xs3hQpMMPPsPAv68qmDAAFKXwHIzMzCl6AoAxXp0QujtP+8AAAAASUVORK5CYII=", + "data": ["iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAAAAABXZoBIAAAA10lEQVR4nGNgGFhgy6xVdrCszBaLFN/mr28+/QOCr69DMCSnA8WvHti0acu/fx/10OS0X/975CDDw8DA1PDn/1pBVEmLf3+zocy2X/+8USXt/82Ds+/+m4sqeehfOpw97d9VFDmlO++t4JwQNMm6f6sZcEpee2+DR/I4A05J7tt4JJP+IUsu+ncRp6TxO9RAQJY0XvrvMAuypNNHuCTz8n+PzVEcy3DtqgiY1ptx6t8/ewY0yX9ntoDA63//Xs3hQpMMPPsPAv68qmDAAFKXwHIzMzCl6AoAxXp0QujtP+8AAAAASUVORK5CYII="], "datatype": "BYTES", - "name": "312a4eb0-0ca7-4803-a101-a6d2c18486fe", - "shape": -1 + "name": "e8d5afed-0a56-4deb-ac9c-352663f51b93", + "shape": [-1] } ] -} \ No newline at end of file +} diff --git a/kubernetes/kserve/kf_request_json/v2/mnist/tobytes.py b/kubernetes/kserve/kf_request_json/v2/mnist/tobytes.py index f065acd31f..71ef7d3b62 100644 --- 
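The curl calls documented above can equally be scripted. Below is a minimal Python sketch, not part of this PR, assuming the KServe wrapper is listening on localhost:8080 and the model is registered as `mnist`, as in the README:

```python
import json

import requests  # third-party HTTP client, assumed to be installed

# Load a request body such as the mnist_v2_bytes.json shown above
with open("mnist_v2_bytes.json") as f:
    payload = json.load(f)

# Equivalent of: curl -v -H "Content-Type: application/json" \
#   http://localhost:8080/v2/models/mnist/infer -d @./mnist_v2_bytes.json
response = requests.post(
    "http://localhost:8080/v2/models/mnist/infer",
    headers={"Content-Type": "application/json"},
    json=payload,
)
print(response.json())
# Expected shape per the README, e.g.:
# {"id": "...", "model_name": "mnist", "model_version": "1.0",
#  "outputs": [{"name": "predict", "shape": [1], "datatype": "INT64", "data": [0]}]}
```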
a/kubernetes/kserve/kf_request_json/v2/mnist/tobytes.py +++ b/kubernetes/kserve/kf_request_json/v2/mnist/tobytes.py @@ -1,6 +1,6 @@ +import argparse import base64 import json -import argparse import uuid parser = argparse.ArgumentParser() @@ -10,11 +10,20 @@ image = open(args.filename, "rb") # open binary file in read mode image_read = image.read() image_64_encode = base64.b64encode(image_read) -bytes_array = image_64_encode.decode("utf-8") +bytes_array = [image_64_encode.decode("utf-8")] request = { - "inputs": [{"name": str(uuid.uuid4()), "shape": -1, "datatype": "BYTES", "data": bytes_array}] + "inputs": [ + { + "name": str(uuid.uuid4()), + "shape": [-1], + "datatype": "BYTES", + "data": bytes_array, + } + ] } -result_file = "{filename}.{ext}".format(filename=str(args.filename).split(".")[0], ext="json") +result_file = "{filename}.{ext}".format( + filename=str(args.filename).split(".")[0], ext="json" +) with open(result_file, "w") as outfile: json.dump(request, outfile, indent=4, sort_keys=True) diff --git a/kubernetes/kserve/kserve_wrapper/README.md b/kubernetes/kserve/kserve_wrapper/README.md index f235de00b7..54837b945d 100644 --- a/kubernetes/kserve/kserve_wrapper/README.md +++ b/kubernetes/kserve/kserve_wrapper/README.md @@ -77,7 +77,7 @@ grpc_management_port=7071 enable_envvars_config=true install_py_dep_per_model=true enable_metrics_api=true -metrics_format=prometheus +metrics_mode=prometheus NUM_WORKERS=1 number_of_netty_threads=4 job_queue_size=10 @@ -207,7 +207,7 @@ grpc_management_port=7071 enable_envvars_config=true install_py_dep_per_model=true enable_metrics_api=true -metrics_format=prometheus +metrics_mode=prometheus NUM_WORKERS=1 number_of_netty_threads=4 job_queue_size=10 diff --git a/kubernetes/kserve/kserve_wrapper/TorchserveModel.py b/kubernetes/kserve/kserve_wrapper/TorchserveModel.py index cf0aec512d..aa28a50aa7 100644 --- a/kubernetes/kserve/kserve_wrapper/TorchserveModel.py +++ b/kubernetes/kserve/kserve_wrapper/TorchserveModel.py @@ -1,23 +1,19 @@ """ The torchserve side inference end-points request are handled to return a KServe side response """ -import json import logging import pathlib -from typing import Dict import kserve -import tornado.web +from kserve.errors import ModelMissingError from kserve.model import Model as Model -from kserve.model import ModelMissingError logging.basicConfig(level=kserve.constants.KSERVE_LOGLEVEL) +PREDICTOR_URL_FORMAT = PREDICTOR_V2_URL_FORMAT = "http://{0}/predictions/{1}" +EXPLAINER_URL_FORMAT = EXPLAINER_V2_URL_FORMAT = "http://{0}/explanations/{1}" REGISTER_URL_FORMAT = "{0}/models?initial_workers=1&url={1}" UNREGISTER_URL_FORMAT = "{0}/models/{1}" -PREDICTOR_URL_FORMAT = "http://{0}/v1/models/{1}:predict" -EXPLAINER_URL_FORMAT = "http://{0}/v1/models/{1}:explain" - class TorchserveModel(Model): """The torchserve side inference and explain end-points requests are handled to @@ -49,76 +45,9 @@ def __init__(self, name, inference_address, management_address, model_dir): self.management_address = management_address self.model_dir = model_dir - logging.info("kfmodel Predict URL set to %s", self.predictor_host) + logging.info("Predict URL set to %s", self.predictor_host) self.explainer_host = self.predictor_host - logging.info("kfmodel Explain URL set to %s", self.explainer_host) - - async def predict(self, request: Dict) -> Dict: - """The predict method is called when we hit the inference endpoint and handles - the inference request and response from the Torchserve side and passes it on - to the KServe side.
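The URL constants above capture the main behavioral change in this file: the wrapper now forwards requests to TorchServe's native `/predictions` and `/explanations` routes for both v1 and v2 traffic, instead of the KServe v1-style `:predict`/`:explain` endpoints. A small sketch of how the formats resolve (host and model name are illustrative only):

```python
# New formats introduced by this change
PREDICTOR_URL_FORMAT = "http://{0}/predictions/{1}"
EXPLAINER_URL_FORMAT = "http://{0}/explanations/{1}"

predictor_host = "127.0.0.1:8085"  # illustrative; the wrapper derives this from config
model_name = "mnist"

print(PREDICTOR_URL_FORMAT.format(predictor_host, model_name))
# http://127.0.0.1:8085/predictions/mnist   (was .../v1/models/mnist:predict)
print(EXPLAINER_URL_FORMAT.format(predictor_host, model_name))
# http://127.0.0.1:8085/explanations/mnist  (was .../v1/models/mnist:explain)
```

With the hand-rolled `predict`/`explain` overrides deleted below, request forwarding is left to the base `kserve.Model` class.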
- - Args: - request (Dict): Input request from the http client side. - - Raises: - NotImplementedError: If the predictor host on the KServe side is not - available. - - tornado.web.HTTPError: If there is a bad response from the http client. - - Returns: - Dict: The Response from the input from the inference endpoint. - """ - if not self.predictor_host: - raise NotImplementedError - logging.debug("kfmodel predict request is %s", json.dumps(request)) - logging.info("PREDICTOR_HOST : %s", self.predictor_host) - headers = {"Content-Type": "application/json; charset=UTF-8"} - response = await self._http_client.fetch( - PREDICTOR_URL_FORMAT.format(self.predictor_host, self.name), - method="POST", - request_timeout=self.timeout, - headers=headers, - body=json.dumps(request), - ) - - if response.code != 200: - raise tornado.web.HTTPError(status_code=response.code, reason=response.body) - return json.loads(response.body) - - async def explain(self, request: Dict) -> Dict: - """The explain method is called when we hit the explain endpoint and handles the - explain request and response from the Torchserve side and passes it on to the - KServe side. - - Args: - request (Dict): Input request from the http client side. - - Raises: - NotImplementedError: If the predictor host on the KServe side is not - available. - - tornado.web.HTTPError: If there is a bad response from the http client. - - Returns: - Dict: The Response from the input from the explain endpoint. - """ - if self.explainer_host is None: - raise NotImplementedError - logging.info("kfmodel explain request is %s", json.dumps(request)) - logging.info("EXPLAINER_HOST : %s", self.explainer_host) - headers = {"Content-Type": "application/json; charset=UTF-8"} - response = await self._http_client.fetch( - EXPLAINER_URL_FORMAT.format(self.explainer_host, self.name), - method="POST", - request_timeout=self.timeout, - headers=headers, - body=json.dumps(request), - ) - if response.code != 200: - raise tornado.web.HTTPError(status_code=response.code, reason=response.body) - return json.loads(response.body) + logging.info("Explain URL set to %s", self.explainer_host) def load(self) -> bool: """This method validates model availability in the model directory @@ -129,10 +58,5 @@ def load(self) -> bool: existing_paths = [path for path in paths if path.exists()] if len(existing_paths) == 0: raise ModelMissingError(model_path) - elif len(existing_paths) > 1: - raise RuntimeError( - "More than one model file is detected, " - f"Only one is allowed within model_dir: {existing_paths}" - ) self.ready = True return self.ready diff --git a/kubernetes/kserve/kserve_wrapper/__main__.py b/kubernetes/kserve/kserve_wrapper/__main__.py index e8063426fe..b31e3df375 100644 --- a/kubernetes/kserve/kserve_wrapper/__main__.py +++ b/kubernetes/kserve/kserve_wrapper/__main__.py @@ -12,7 +12,7 @@ DEFAULT_MODEL_NAME = "model" DEFAULT_INFERENCE_ADDRESS = "http://127.0.0.1:8085" INFERENCE_PORT = "8085" -DEFAULT_MANAGEMENT_ADDRESS = "http://127.0.0.1:8081" +DEFAULT_MANAGEMENT_ADDRESS = "http://127.0.0.1:8085" DEFAULT_MODEL_STORE = "/mnt/models/model-store" CONFIG_PATH = "/mnt/models/config/config.properties" @@ -31,10 +31,8 @@ def parse_config(): keys = {} with open(CONFIG_PATH) as f: - for line in f: if separator in line: - # Find the name and value by splitting the string name, value = line.split(separator, 1) @@ -79,13 +77,11 @@ def parse_config(): if __name__ == "__main__": - model_names, inference_address, management_address, model_dir = parse_config() models = [] for model_name
in model_names: - model = TorchserveModel( model_name, inference_address, management_address, model_dir ) @@ -100,5 +96,5 @@ def parse_config(): ModelServer( registered_models=registeredModels, http_port=8080, - grpc_port=7070, + grpc_port=8081, ).start(models) diff --git a/model-archiver/README.md b/model-archiver/README.md index 73571f3205..ae9df6f880 100644 --- a/model-archiver/README.md +++ b/model-archiver/README.md @@ -59,7 +59,7 @@ $ torch-model-archiver -h usage: torch-model-archiver [-h] --model-name MODEL_NAME --version MODEL_VERSION_NUMBER --model-file MODEL_FILE_PATH --serialized-file MODEL_SERIALIZED_PATH --handler HANDLER [--runtime {python,python3}] - [--export-path EXPORT_PATH] [-f] [--requirements-file] + [--export-path EXPORT_PATH] [-f] [--requirements-file] [--config-file] Model Archiver Tool @@ -78,7 +78,7 @@ optional arguments: Path to python file containing model architecture. This parameter is mandatory for eager mode models. The model architecture file must contain only one - class definition extended from torch.nn.modules. + class definition extended from torch.nn.Module. --handler HANDLER TorchServe's default handler name or handler python file path to handle custom TorchServe inference logic. --extra-files EXTRA_FILES @@ -93,7 +93,7 @@ optional arguments: is an optional parameter. If --export-path is not specified, the file will be saved in the current working directory. - --archive-format {tgz,default} + --archive-format {tgz, no-archive, zip-store, default} The format in which the model artifacts are archived. "tgz": This creates the model-archive in .tar.gz format. If platform hosting requires model-artifacts to be in ".tar.gz" @@ -102,6 +102,9 @@ optional arguments: at "export-path/{model-name}" location. As a result of this choice, MANIFEST file will be created at "export-path/{model-name}" location without archiving these model files + "zip-store": This creates the model-archive in .mar format + but will skip deflating the files to speed up creation. Mainly used + for testing purposes "default": This creates the model-archive in .mar format. This is the default archiving format. Models archived in this format will be readily hostable on TorchServe. @@ -113,6 +116,7 @@ optional arguments: -r, --requirements-file Path to requirements.txt file containing a list of model specific python packages to be installed by TorchServe for seamless model serving. + -c, --config-file Path to a model config yaml file. ``` ## Artifact Details @@ -132,6 +136,9 @@ A valid model name must begin with a letter of the alphabet and can only contain A model file should contain the model architecture. This file is mandatory in case of eager mode models. +This file should contain a single class that inherits from +[torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html). + ### Serialized file A serialized file (.pt or .pth) should be a checkpoint in case of torchscript and state_dict in case of eager mode. @@ -152,6 +159,32 @@ e.g. if your custom handler custom_image_classifier.py is in /home/serve/example For more details refer [default handler documentation](../docs/default_handlers.md) or [custom handler documentation](../docs/custom_service.md) +### Config file + +A model config yaml file. 
For example: + +``` +# TS frontend parameters +# See all supported parameters: https://github.com/pytorch/serve/blob/master/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java#L14 +minWorkers: 1 # default: #CPU or #GPU +maxWorkers: 1 # default: #CPU or #GPU +batchSize: 1 # default: 1 +maxBatchDelay: 100 # default: 100 msec +responseTimeout: 120 # default: 120 sec +deviceType: cpu # cpu, gpu, neuron +deviceIds: [0,1,2,3] # gpu device ids allocated to this model. +parallelType: pp # pp: pipeline parallel; pptp: tensor+pipeline parallel. Default: empty + +# See torchrun parameters: https://pytorch.org/docs/stable/elastic/run.html +torchrun: + nproc-per-node: 2 + +# TS backend parameters +pippy: + rpc_timeout: 1800 + pp_group_size: 4 # pipeline parallel size, tp_group_size = world size / pp_group_size +``` + ## Creating a Model Archive **1. Download the torch model archiver source** @@ -179,7 +212,7 @@ This will package all the model artifacts files and output `densenet_161.mar` in ### Model specific custom python requirements -Custom models/handlers may depend on different python packages which are not installed by-default as a part of `TorchServe` setup. +Custom models/handlers may depend on different python packages which are not installed by-default as a part of `TorchServe` setup. Supply a [python requirements](https://pip.pypa.io/en/stable/user_guide/#requirements-files) file containing the list of required python packages to be installed by `TorchServe` for seamless model serving using `--requirements-file` parameter while creating the model-archiver. Example: diff --git a/model-archiver/model_archiver/arg_parser.py b/model-archiver/model_archiver/arg_parser.py index d520ba3be2..17a54e6380 100644 --- a/model-archiver/model_archiver/arg_parser.py +++ b/model-archiver/model_archiver/arg_parser.py @@ -17,7 +17,6 @@ class ArgParser(object): @staticmethod def export_model_args_parser(): - """Argument parser for torch-model-export""" parser_export = argparse.ArgumentParser( @@ -54,7 +53,7 @@ def export_model_args_parser(): help="Path to python file containing model architecture.\n" "This parameter is mandatory for eager mode models.\n" "The model architecture file must contain only one\n" - "class definition extended from torch.nn.modules.", + "class definition extended from torch.nn.Module.", ) parser_export.add_argument( @@ -100,7 +99,7 @@ def export_model_args_parser(): required=False, type=str, default="default", - choices=["tgz", "no-archive", "default"], + choices=["tgz", "no-archive", "zip-store", "default"], help="The format in which the model artifacts are archived.\n" '"tgz": This creates the model-archive in .tar.gz format.\n' 'If platform hosting TorchServe requires model-artifacts to be in ".tar.gz"\n' @@ -109,6 +108,9 @@ def export_model_args_parser(): 'at "export-path/{model-name}" location. As a result of this choice, \n' 'MANIFEST file will be created at "export-path/{model-name}" location\n' "without archiving these model files\n" + '"zip-store": This creates the model-archive in .mar format\n' + "but will skip deflating the files to speed up creation. Mainly used\n" + "for testing purposes\n" '"default": This creates the model-archive in .mar format.\n' "This is the default archiving format. 
Models archived in this format\n" "will be readily hostable on native TorchServe.\n", @@ -143,4 +145,13 @@ def export_model_args_parser(): " packages.", ) + parser_export.add_argument( + "-c", + "--config-file", + required=False, + type=str, + default=None, + help="Path to a yaml file containing model configuration eg. batch_size.", + ) + return parser_export diff --git a/model-archiver/model_archiver/manifest_components/manifest.py b/model-archiver/model_archiver/manifest_components/manifest.py index 9828a69951..f42c804616 100644 --- a/model-archiver/model_archiver/manifest_components/manifest.py +++ b/model-archiver/model_archiver/manifest_components/manifest.py @@ -8,7 +8,6 @@ class RuntimeType(Enum): - PYTHON = "python" PYTHON3 = "python3" @@ -19,7 +18,6 @@ class Manifest(object): """ def __init__(self, runtime, model): - self.creation_time = datetime.now().strftime("%d/%m/%Y %H:%M:%S") self.runtime = RuntimeType(runtime) self.model = model diff --git a/model-archiver/model_archiver/manifest_components/model.py b/model-archiver/model_archiver/manifest_components/model.py index b985788599..e250ad61bd 100644 --- a/model-archiver/model_archiver/manifest_components/model.py +++ b/model-archiver/model_archiver/manifest_components/model.py @@ -1,6 +1,6 @@ # pylint: disable=missing-docstring import json -import sys +import os class Model(object): @@ -9,48 +9,55 @@ class Model(object): as the entry point into the service code through the handler property """ - def __init__(self, model_name, serialized_file, handler, model_file=None, model_version=None, - extensions=None, requirements_file=None): - + def __init__( + self, + model_name, + serialized_file, + handler, + model_file=None, + model_version=None, + extensions=None, + requirements_file=None, + config_file=None, + ): self.model_name = model_name self.serialized_file = None if serialized_file: - if sys.platform.startswith('win32') and serialized_file.find("\\") != -1: - self.serialized_file = serialized_file.split("\\")[-1] - else: - self.serialized_file = serialized_file.split("/")[-1] + self.serialized_file = os.path.basename(serialized_file) self.model_file = model_file self.model_version = model_version self.extensions = extensions - if sys.platform.startswith('win32') and handler.find("\\") != -1: - self.handler = handler.split("\\")[-1] - else: - self.handler = handler.split("/")[-1] + self.handler = os.path.basename(handler) self.requirements_file = requirements_file - + self.config_file = None + if config_file: + self.config_file = os.path.basename(config_file) self.model_dict = self.__to_dict__() def __to_dict__(self): - model_dict = dict() + model_dict = {} - model_dict['modelName'] = self.model_name + model_dict["modelName"] = self.model_name if self.serialized_file: - model_dict['serializedFile'] = self.serialized_file + model_dict["serializedFile"] = self.serialized_file - model_dict['handler'] = self.handler + model_dict["handler"] = self.handler if self.model_file: - model_dict['modelFile'] = self.model_file.split("/")[-1] + model_dict["modelFile"] = self.model_file.split("/")[-1] if self.model_version: - model_dict['modelVersion'] = self.model_version + model_dict["modelVersion"] = self.model_version if self.extensions: - model_dict['extensions'] = self.extensions + model_dict["extensions"] = self.extensions if self.requirements_file: - model_dict['requirementsFile'] = self.requirements_file.split("/")[-1] + model_dict["requirementsFile"] = self.requirements_file.split("/")[-1] + + if self.config_file: + 
model_dict["configFile"] = self.config_file return model_dict diff --git a/model-archiver/model_archiver/model_packaging.py b/model-archiver/model_archiver/model_packaging.py index bcbbfda732..3304f6a4f1 100644 --- a/model-archiver/model_archiver/model_packaging.py +++ b/model-archiver/model_archiver/model_packaging.py @@ -22,6 +22,7 @@ def package_model(args, manifest): extra_files = args.extra_files export_file_path = args.export_path requirements_file = args.requirements_file + config_file = args.config_file try: ModelExportUtils.validate_inputs(model_name, export_file_path) @@ -37,6 +38,7 @@ def package_model(args, manifest): "handler": handler, "extra_files": extra_files, "requirements-file": requirements_file, + "config_file": config_file, } model_path = ModelExportUtils.copy_artifacts(model_name, **artifact_files) diff --git a/model-archiver/model_archiver/model_packaging_utils.py b/model-archiver/model_archiver/model_packaging_utils.py index 40eabbd249..86647a1514 100644 --- a/model-archiver/model_archiver/model_packaging_utils.py +++ b/model-archiver/model_archiver/model_packaging_utils.py @@ -2,6 +2,7 @@ Helper utils for Model Export tool """ +import glob import logging import os import re @@ -10,12 +11,18 @@ import tempfile import zipfile from io import BytesIO +from pathlib import Path from .manifest_components.manifest import Manifest from .manifest_components.model import Model from .model_archiver_error import ModelArchiverError -archiving_options = {"tgz": ".tar.gz", "no-archive": "", "default": ".mar"} +archiving_options = { + "tgz": ".tar.gz", + "no-archive": "", + "zip-store": ".mar", + "default": ".mar", +} model_handlers = { @@ -107,6 +114,7 @@ def generate_model(modelargs): handler=modelargs.handler, model_version=modelargs.version, requirements_file=modelargs.requirements_file, + config_file=modelargs.config_file, ) return model @@ -156,20 +164,24 @@ def copy_artifacts(model_name, **kwargs): path = (path.split(":")[0] if ":" in path else path) + ".py" if file_type == "extra_files": - for file in path.split(","): - file = file.strip() - if os.path.isfile(file): - shutil.copy2(file, model_path) - elif os.path.isdir(file) and file != model_path: - for item in os.listdir(file): - src = os.path.join(file, item) - dst = os.path.join(model_path, item) - if os.path.isfile(src): - shutil.copy2(src, dst) - elif os.path.isdir(src): - shutil.copytree(src, dst, False, None) - else: - raise ValueError(f"Invalid extra file given {file}") + for path_or_wildcard in path.split(","): + if not Path(path_or_wildcard).exists(): + raise FileNotFoundError( + f"File does not exist: {path_or_wildcard}" + ) + for file in glob.glob(path_or_wildcard.strip()): + if os.path.isfile(file): + shutil.copy2(file, model_path) + elif os.path.isdir(file) and file != model_path: + for item in os.listdir(file): + src = os.path.join(file, item) + dst = os.path.join(model_path, item) + if os.path.isfile(src): + shutil.copy2(src, dst) + elif os.path.isdir(src): + shutil.copytree(src, dst, False, None) + else: + raise ValueError(f"Invalid extra file given {file}") else: shutil.copy(path, model_path) @@ -216,7 +228,12 @@ def archive( with open(os.path.join(manifest_path, MANIFEST_FILE_NAME), "w") as f: f.write(manifest) else: - with zipfile.ZipFile(mar_path, "w", zipfile.ZIP_DEFLATED) as z: + zip_mode = ( + zipfile.ZIP_STORED + if archive_format == "zip-store" + else zipfile.ZIP_DEFLATED + ) + with zipfile.ZipFile(mar_path, "w", zip_mode) as z: ModelExportUtils.archive_dir( model_path, z, archive_format, 
model_name ) @@ -235,7 +252,6 @@ def archive( @staticmethod def archive_dir(path, dst, archive_format, model_name): - """ This method zips the dir and filters out some files based on a expression :param archive_format: diff --git a/model-archiver/model_archiver/tests/conftest.py b/model-archiver/model_archiver/tests/conftest.py new file mode 100644 index 0000000000..78fc4d52f7 --- /dev/null +++ b/model-archiver/model_archiver/tests/conftest.py @@ -0,0 +1,50 @@ +import json +from pathlib import Path + +import pytest + +INTEG_TEST_CONFIG_FILE = "integ_tests/configuration.json" +DEFAULT_HANDLER_CONFIG_FILE = "integ_tests/default_handler_configuration.json" + +TEST_ROOT_DIR = Path(__file__).parent +MODEL_ARCHIVER_ROOT_DIR = Path(__file__).parents[2] + + +def make_paths_absolute(test, keys): + def make_absolute(paths): + if "," in paths: + return ",".join([make_absolute(p) for p in paths.split(",")]) + return MODEL_ARCHIVER_ROOT_DIR.joinpath(paths).as_posix() + + for k in keys: + test[k] = make_absolute(test[k]) + + return test + + +@pytest.fixture(name="integ_tests") +def load_integ_tests(): + with open(TEST_ROOT_DIR.joinpath(INTEG_TEST_CONFIG_FILE), "r") as f: + tests = json.loads(f.read()) + keys = ( + "model-file", + "serialized-file", + "handler", + "extra-files", + ) + return [make_paths_absolute(t, keys) for t in tests] + + +@pytest.fixture(name="default_handler_tests") +def load_default_handler_tests(): + with open(TEST_ROOT_DIR.joinpath(DEFAULT_HANDLER_CONFIG_FILE), "r") as f: + default_handler_tests = json.loads(f.read()) + keys = ( + "model-file", + "serialized-file", + "extra-files", + ) + default_handler_tests = [ + make_paths_absolute(t, keys) for t in default_handler_tests + ] + return default_handler_tests diff --git a/model-archiver/model_archiver/tests/integ_tests/configuration.json b/model-archiver/model_archiver/tests/integ_tests/configuration.json index 819164e7b5..565726e44f 100644 --- a/model-archiver/model_archiver/tests/integ_tests/configuration.json +++ b/model-archiver/model_archiver/tests/integ_tests/configuration.json @@ -60,6 +60,20 @@ "version": "1.0", "force": true }, + { + "name": "packaging_zip_store_mar", + "model-name": "model", + "model-file": "model_archiver/tests/integ_tests/resources/regular_model/test_model.py", + "serialized-file": "model_archiver/tests/integ_tests/resources/regular_model/test_serialized_file.pt", + "handler": "model_archiver/tests/integ_tests/resources/regular_model/test_handler.py", + "extra-files": "model_archiver/tests/integ_tests/resources/regular_model/test_index_to_name.json", + "export-path": "model", + "archive-format": "zip-store", + "iterations": 2, + "version": "1.0", + "force": true, + "config-file": "" + }, { "name": "packaging_mar_with_handler_name", "model-name": "model", @@ -85,4 +99,32 @@ "iterations": 2, "version": "1.0", "force": true - }] \ No newline at end of file + }, + { + "name": "extra_files_path", + "model-name": "model", + "model-file": "model_archiver/tests/integ_tests/resources/regular_model/test_model.py", + "serialized-file": "model_archiver/tests/integ_tests/resources/regular_model/test_serialized_file.pt", + "handler": "model_archiver/tests/integ_tests/resources/regular_model/test_handler", + "extra-files": "model_archiver/tests/integ_tests/resources/regular_model/", + "export-path": "model", + "archive-format": "default", + "iterations": 1, + "version": "1.0", + "config-file": "", + "expect-error": false + }, + { + "name": "missing_extra_files", + "model-name": "model", + "model-file": 
"model_archiver/tests/integ_tests/resources/regular_model/test_model.py", + "serialized-file": "model_archiver/tests/integ_tests/resources/regular_model/test_serialized_file.pt", + "handler": "model_archiver/tests/integ_tests/resources/regular_model/test_handler", + "extra-files": "model_archiver/tests/integ_tests/resources/regular_model/missing.json", + "export-path": "model", + "archive-format": "default", + "iterations": 1, + "version": "1.0", + "config-file": "", + "expect-error": true + }] diff --git a/model-archiver/model_archiver/tests/integ_tests/test_integration_model_archiver.py b/model-archiver/model_archiver/tests/integ_tests/test_integration_model_archiver.py index 89b4a907ba..089bed7b66 100644 --- a/model-archiver/model_archiver/tests/integ_tests/test_integration_model_archiver.py +++ b/model-archiver/model_archiver/tests/integ_tests/test_integration_model_archiver.py @@ -1,16 +1,23 @@ -import platform -import time -from datetime import datetime import errno import json import os +import platform import shutil import tempfile -import subprocess +import time +from argparse import Namespace +from datetime import datetime +from pathlib import Path + import model_archiver DEFAULT_RUNTIME = "python" MANIFEST_FILE = "MAR-INF/MANIFEST.json" +INTEG_TEST_CONFIG_FILE = "integ_tests/configuration.json" +DEFAULT_HANDLER_CONFIG_FILE = "integ_tests/default_handler_configuration.json" + +TEST_ROOT_DIR = Path(__file__).parents[1] +MODEL_ARCHIVER_ROOT_DIR = Path(__file__).parents[3] def create_file_path(path): @@ -33,27 +40,43 @@ def delete_file_path(path): pass -def run_test(test, cmd): - it = test.get("iterations") if test.get("iterations") is not None else 1 +def run_test(test, args, mocker): + m = mocker.patch( + "model_archiver.model_packaging.ArgParser.export_model_args_parser", + ) + m.return_value.parse_args.return_value = args + mocker.patch("sys.exit", side_effect=Exception()) + from model_archiver.model_packaging import generate_model_archive + + it = test.get("iterations", 1) for i in range(it): try: - subprocess.check_call(cmd, shell=True) - except subprocess.CalledProcessError as exc: + generate_model_archive() + except Exception as exc: if test.get("expect-error") is not True: - assert 0, "{}".format(exc.output) + assert 0, str(exc) else: return 0 + # In case we expect an error we should not be here + if test.get("expect-error") is True: + assert 0, f"Error expected in test: {test['name']}" return 1 def validate_archive_exists(test): fmt = test.get("archive-format") if fmt == "tgz": - assert os.path.isfile(os.path.join(test.get("export-path"), test.get("model-name")+".tar.gz")) + assert os.path.isfile( + os.path.join(test.get("export-path"), test.get("model-name") + ".tar.gz") + ) elif fmt == "no-archive": - assert os.path.isdir(os.path.join(test.get("export-path"), test.get("model-name"))) + assert os.path.isdir( + os.path.join(test.get("export-path"), test.get("model-name")) + ) else: - assert os.path.isfile(os.path.join(test.get("export-path"), test.get("model-name")+".mar")) + assert os.path.isfile( + os.path.join(test.get("export-path"), test.get("model-name") + ".mar") + ) def validate_manifest_file(manifest, test, default_handler=None): @@ -67,7 +90,9 @@ def validate_manifest_file(manifest, test, default_handler=None): assert manifest.get("runtime") == test.get("runtime") assert manifest.get("model").get("modelName") == test.get("model-name") if not default_handler: - assert manifest.get("model").get("handler") == test.get("handler").split("/")[-1] + assert ( + 
manifest.get("model").get("handler") == test.get("handler").split("/")[-1] + ) else: assert manifest.get("model").get("handler") == test.get("handler") assert manifest.get("archiverVersion") == model_archiver.__version__ @@ -87,21 +112,29 @@ def validate_files(file_list, prefix, default_handler=None): def validate_tar_archive(test_cfg): import tarfile - file_name = os.path.join(test_cfg.get("export-path"), test_cfg.get("model-name") + ".tar.gz") + + file_name = os.path.join( + test_cfg.get("export-path"), test_cfg.get("model-name") + ".tar.gz" + ) f = tarfile.open(file_name, "r:gz") - manifest = json.loads(f.extractfile(os.path.join(test_cfg.get("model-name"), MANIFEST_FILE)).read()) + manifest = json.loads( + f.extractfile(os.path.join(test_cfg.get("model-name"), MANIFEST_FILE)).read() + ) validate_manifest_file(manifest, test_cfg) validate_files(f.getnames(), test_cfg.get("model-name")) def validate_noarchive_archive(test): - file_name = os.path.join(test.get("export-path"), test.get("model-name"), MANIFEST_FILE) + file_name = os.path.join( + test.get("export-path"), test.get("model-name"), MANIFEST_FILE + ) manifest = json.loads(open(file_name).read()) validate_manifest_file(manifest, test) def validate_mar_archive(test): import zipfile + file_name = os.path.join(test.get("export-path"), test.get("model-name") + ".mar") zf = zipfile.ZipFile(file_name, "r") manifest = json.loads(zf.open(MANIFEST_FILE).read()) @@ -123,58 +156,101 @@ def validate(test): validate_archive_content(test) -def build_cmd(test): - args = ['model-name', 'model-file', 'serialized-file', 'handler', 'extra-files', 'archive-format', - 'version', 'export-path', 'runtime'] - - cmd = ["torch-model-archiver"] - - for arg in args: - if arg in test: - cmd.append("--{0} {1}".format(arg, test[arg])) - - return " ".join(cmd) - - -def test_model_archiver(): - with open("model_archiver/tests/integ_tests/configuration.json", "r") as f: - tests = json.loads(f.read()) - for test in tests: - # tar.gz format problem on windows hence ignore - if platform.system() == "Windows" and test['archive-format'] == 'tgz': - continue - try: - test["export-path"] = os.path.join(tempfile.gettempdir(), test["export-path"]) - delete_file_path(test.get("export-path")) - create_file_path(test.get("export-path")) - test["runtime"] = test.get("runtime", DEFAULT_RUNTIME) - test["model-name"] = test["model-name"] + '_' + str(int(time.time()*1000.0)) - cmd = build_cmd(test) - if test.get("force"): - cmd += " -f" - - if run_test(test, cmd): - validate(test) - finally: - delete_file_path(test.get("export-path")) - - -def test_default_handlers(): - with open("model_archiver/tests/integ_tests/default_handler_configuration.json", "r") as f: - tests = json.loads(f.read()) - for test in tests: - cmd = build_cmd(test) - try: - delete_file_path(test.get("export-path")) - create_file_path(test.get("export-path")) - - if test.get("force"): - cmd += " -f" - - if run_test(test, cmd): - validate(test) - finally: - delete_file_path(test.get("export-path")) +def build_namespace(test): + keys = [ + "model-name", + "model-file", + "serialized-file", + "handler", + "extra-files", + "archive-format", + "version", + "export-path", + "runtime", + "requirements-file", + "config-file", + "force", + ] + test["requirements-file"] = None + test["config-file"] = None + test["force"] = test.get("force", False) + test["runtime"] = test.get("runtime", DEFAULT_RUNTIME) + test["archive-format"] = test.get("archive-format", "default") + + args = Namespace(**{k.replace("-", "_"): test[k] 
for k in keys}) + + return args + + +def make_paths_absolute(test, keys): + def make_absolute(paths): + if "," in paths: + return ",".join([make_absolute(p) for p in paths.split(",")]) + return MODEL_ARCHIVER_ROOT_DIR.joinpath(paths).as_posix() + + for k in keys: + test[k] = make_absolute(test[k]) + + return test + + +def test_model_archiver(integ_tests, mocker): + for test in integ_tests: + # tar.gz format problem on windows hence ignore + if platform.system() == "Windows" and test["archive-format"] == "tgz": + continue + try: + test["export-path"] = os.path.join( + tempfile.gettempdir(), test["export-path"] + ) + delete_file_path(test.get("export-path")) + create_file_path(test.get("export-path")) + test["runtime"] = test.get("runtime", DEFAULT_RUNTIME) + test["model-name"] = ( + test["model-name"] + "_" + str(int(time.time() * 1000.0)) + ) + args = build_namespace(test) + + if run_test(test, args, mocker): + validate(test) + finally: + delete_file_path(test.get("export-path")) + + +def test_default_handlers(default_handler_tests, mocker): + for test in default_handler_tests: + cmd = build_namespace(test) + try: + delete_file_path(test.get("export-path")) + create_file_path(test.get("export-path")) + + if run_test(test, cmd, mocker): + validate(test) + finally: + delete_file_path(test.get("export-path")) + + +def test_zip_store(tmp_path, integ_tests, mocker): + integ_tests = list( + filter(lambda t: t["name"] == "packaging_zip_store_mar", integ_tests) + ) + assert len(integ_tests) == 1 + test = integ_tests[0] + + test["export-path"] = tmp_path + test["iterations"] = 1 + + test["model-name"] = "zip-store" + run_test(test, build_namespace(test), mocker) + + test["model-name"] = "zip" + test["archive-format"] = "default" + run_test(test, build_namespace(test), mocker) + + stored_size = Path(tmp_path).joinpath("zip-store.mar").stat().st_size + zipped_size = Path(tmp_path).joinpath("zip.mar").stat().st_size + + assert zipped_size < stored_size if __name__ == "__main__": diff --git a/model-archiver/model_archiver/tests/unit_tests/test_model_packaging.py b/model-archiver/model_archiver/tests/unit_tests/test_model_packaging.py index 6297318704..da49fb1fc1 100644 --- a/model-archiver/model_archiver/tests/unit_tests/test_model_packaging.py +++ b/model-archiver/model_archiver/tests/unit_tests/test_model_packaging.py @@ -1,14 +1,6 @@ - - from collections import namedtuple import pytest -import sys -from mock import MagicMock - -sys.modules['shutil'] = MagicMock() -sys.modules['shutil.rmtree'] = MagicMock() - from model_archiver.manifest_components.manifest import RuntimeType from model_archiver.model_packaging import generate_model_archive, package_model from model_archiver.model_packaging_utils import ModelExportUtils @@ -16,7 +8,6 @@ # noinspection PyClassHasNoInit class TestModelPackaging: - class Namespace: def __init__(self, **kwargs): self.__dict__.update(kwargs) @@ -24,26 +15,42 @@ def __init__(self, **kwargs): def update(self, **kwargs): self.__dict__.update(kwargs) - model_name = 'my-model' - model_file = 'my-model/' - serialized_file = 'my-model/' - handler = 'a.py::my-awesome-func' - export_path = '/Users/dummyUser/' - version = '1.0' + model_name = "my-model" + model_file = "my-model/" + serialized_file = "my-model/" + handler = "a.py::my-awesome-func" + export_path = "/Users/dummyUser/" + version = "1.0" requirements_file = "requirements.txt" + config_file = None source_vocab = None - args = Namespace(model_name=model_name, handler=handler, runtime=RuntimeType.PYTHON.value, 
model_file=model_file, - serialized_file=serialized_file, extra_files=None, export_path=export_path, force=False, - archive_format="default", convert=False, version=version, source_vocab=source_vocab, - requirements_file=requirements_file) + args = Namespace( + model_name=model_name, + handler=handler, + runtime=RuntimeType.PYTHON.value, + model_file=model_file, + serialized_file=serialized_file, + extra_files=None, + export_path=export_path, + force=False, + archive_format="default", + convert=False, + version=version, + source_vocab=source_vocab, + requirements_file=requirements_file, + config_file=None, + ) @pytest.fixture() def patches(self, mocker): - Patches = namedtuple('Patches', ['arg_parse', 'export_utils', 'export_method']) - patches = Patches(mocker.patch('model_archiver.model_packaging.ArgParser'), - mocker.patch('model_archiver.model_packaging.ModelExportUtils'), - mocker.patch('model_archiver.model_packaging.package_model')) + Patches = namedtuple("Patches", ["arg_parse", "export_utils", "export_method"]) + patches = Patches( + mocker.patch("model_archiver.model_packaging.ArgParser"), + mocker.patch("model_archiver.model_packaging.ModelExportUtils"), + mocker.patch("model_archiver.model_packaging.package_model"), + ) + mocker.patch("shutil.rmtree") return patches @@ -53,8 +60,11 @@ def test_gen_model_archive(self, patches): patches.export_method.assert_called() def test_export_model_method(self, patches): - patches.export_utils.check_mar_already_exists.return_value = '/Users/dummyUser/' - patches.export_utils.check_custom_model_types.return_value = '/Users/dummyUser', ['a.txt', 'b.txt'] + patches.export_utils.check_mar_already_exists.return_value = "/Users/dummyUser/" + patches.export_utils.check_custom_model_types.return_value = ( + "/Users/dummyUser", + ["a.txt", "b.txt"], + ) patches.export_utils.zip.return_value = None package_model(self.args, ModelExportUtils.generate_manifest_json(self.args)) @@ -63,8 +73,11 @@ def test_export_model_method(self, patches): def test_export_model_method_tar(self, patches): self.args.update(archive_format="tar") - patches.export_utils.check_mar_already_exists.return_value = '/Users/dummyUser/' - patches.export_utils.check_custom_model_types.return_value = '/Users/dummyUser', ['a.txt', 'b.txt'] + patches.export_utils.check_mar_already_exists.return_value = "/Users/dummyUser/" + patches.export_utils.check_custom_model_types.return_value = ( + "/Users/dummyUser", + ["a.txt", "b.txt"], + ) patches.export_utils.zip.return_value = None package_model(self.args, ModelExportUtils.generate_manifest_json(self.args)) @@ -73,8 +86,11 @@ def test_export_model_method_tar(self, patches): def test_export_model_method_noarchive(self, patches): self.args.update(archive_format="no-archive") - patches.export_utils.check_mar_already_exists.return_value = '/Users/dummyUser/' - patches.export_utils.check_custom_model_types.return_value = '/Users/dummyUser', ['a.txt', 'b.txt'] + patches.export_utils.check_mar_already_exists.return_value = "/Users/dummyUser/" + patches.export_utils.check_custom_model_types.return_value = ( + "/Users/dummyUser", + ["a.txt", "b.txt"], + ) patches.export_utils.zip.return_value = None package_model(self.args, ModelExportUtils.generate_manifest_json(self.args)) diff --git a/model-archiver/model_archiver/tests/unit_tests/test_model_packaging_utils.py b/model-archiver/model_archiver/tests/unit_tests/test_model_packaging_utils.py index 36a15fd2da..cc453be94a 100644 --- 
a/model-archiver/model_archiver/tests/unit_tests/test_model_packaging_utils.py +++ b/model-archiver/model_archiver/tests/unit_tests/test_model_packaging_utils.py @@ -1,13 +1,14 @@ - - import json import platform +from collections import namedtuple +from pathlib import Path import pytest -from collections import namedtuple -from model_archiver.model_packaging_utils import ModelExportUtils from model_archiver.manifest_components.manifest import RuntimeType from model_archiver.model_archiver_error import ModelArchiverError +from model_archiver.model_packaging_utils import ModelExportUtils + +MANIFEST_FILE = Path(__file__).parents[1].joinpath("integ_tests/MAR-INF/MANIFEST.json") # noinspection PyClassHasNoInit @@ -15,111 +16,125 @@ def _validate_mar(patches): if platform.system() == "Windows": patches.path_exists.assert_called_once_with("/Users/dummyUser\\some-model.mar") else: - patches.path_exists.assert_called_once_with('/Users/dummyUser/some-model.mar') + patches.path_exists.assert_called_once_with("/Users/dummyUser/some-model.mar") + # noinspection PyClassHasNoInit class TestExportModelUtils: - # noinspection PyClassHasNoInit class TestMarExistence: - @pytest.fixture() def patches(self, mocker): - Patches = namedtuple('Patches', ['getcwd', 'path_exists']) - patches = Patches(mocker.patch('os.getcwd'), mocker.patch('os.path.exists')) - patches.getcwd.return_value = '/Users/dummyUser' + Patches = namedtuple("Patches", ["getcwd", "path_exists"]) + patches = Patches(mocker.patch("os.getcwd"), mocker.patch("os.path.exists")) + patches.getcwd.return_value = "/Users/dummyUser" return patches def test_export_file_is_none(self, patches): patches.path_exists.return_value = False - ret_val = ModelExportUtils.check_mar_already_exists('some-model', None, False) + ret_val = ModelExportUtils.check_mar_already_exists( + "some-model", None, False + ) _validate_mar(patches) assert ret_val == "/Users/dummyUser" def test_export_file_is_not_none(self, patches): patches.path_exists.return_value = False - ModelExportUtils.check_mar_already_exists('some-model', '/Users/dummyUser/', False) - patches.path_exists.assert_called_once_with('/Users/dummyUser/some-model.mar') + ModelExportUtils.check_mar_already_exists( + "some-model", "/Users/dummyUser/", False + ) + patches.path_exists.assert_called_once_with( + "/Users/dummyUser/some-model.mar" + ) def test_export_file_already_exists_with_override(self, patches): patches.path_exists.return_value = True - ModelExportUtils.check_mar_already_exists('some-model', None, True) + ModelExportUtils.check_mar_already_exists("some-model", None, True) _validate_mar(patches) def test_export_file_already_exists_with_override_false(self, patches): patches.path_exists.return_value = True with pytest.raises(ModelArchiverError): - ModelExportUtils.check_mar_already_exists('some-model', None, False) + ModelExportUtils.check_mar_already_exists("some-model", None, False) _validate_mar(patches) def test_export_file_is_none_tar(self, patches): patches.path_exists.return_value = False - ret_val = ModelExportUtils.check_mar_already_exists('some-model', None, False, archive_format='tgz') + ret_val = ModelExportUtils.check_mar_already_exists( + "some-model", None, False, archive_format="tgz" + ) if platform.system() == "Windows": - patches.path_exists.assert_called_once_with("/Users/dummyUser\\some-model.tar.gz") + patches.path_exists.assert_called_once_with( + "/Users/dummyUser\\some-model.tar.gz" + ) else: - patches.path_exists.assert_called_once_with("/Users/dummyUser/some-model.tar.gz") + 
patches.path_exists.assert_called_once_with( + "/Users/dummyUser/some-model.tar.gz" + ) assert ret_val == "/Users/dummyUser" class TestArchiveTypes: def test_archive_types(self): - from model_archiver.model_packaging_utils import archiving_options as ar_opts + from model_archiver.model_packaging_utils import ( + archiving_options as ar_opts, + ) + assert ar_opts.get("tgz") == ".tar.gz" assert ar_opts.get("no-archive") == "" + assert ar_opts.get("zip-store") == ".mar" assert ar_opts.get("default") == ".mar" - assert len(ar_opts) == 3 + assert len(ar_opts) == 4 # noinspection PyClassHasNoInit class TestCustomModelTypes: - - model_path = '/Users/dummyUser' + model_path = "/Users/dummyUser" @pytest.fixture() def patches(self, mocker): - Patches = namedtuple('Patches', ['utils', 'listdir']) - patch = Patches(mocker.patch('model_archiver.model_packaging_utils.ModelExportUtils'), - mocker.patch('os.listdir')) + Patches = namedtuple("Patches", ["utils", "listdir"]) + patch = Patches( + mocker.patch("model_archiver.model_packaging_utils.ModelExportUtils"), + mocker.patch("os.listdir"), + ) - patch.listdir.return_value = {'a', 'b', 'c'} + patch.listdir.return_value = {"a", "b", "c"} return patch # noinspection PyClassHasNoInit class TestFindUnique: - def test_with_count_zero(self): - files = ['a.txt', 'b.txt', 'c.txt'] - suffix = '.mxnet' + files = ["a.txt", "b.txt", "c.txt"] + suffix = ".mxnet" val = ModelExportUtils.find_unique(files, suffix) assert val is None def test_with_count_one(self): - files = ['a.mxnet', 'b.txt', 'c.txt'] - suffix = '.mxnet' + files = ["a.mxnet", "b.txt", "c.txt"] + suffix = ".mxnet" val = ModelExportUtils.find_unique(files, suffix) - assert val == 'a.mxnet' + assert val == "a.mxnet" def test_with_exit(self): - files = ['a.onnx', 'b.onnx', 'c.txt'] - suffix = '.onnx' + files = ["a.onnx", "b.onnx", "c.txt"] + suffix = ".onnx" with pytest.raises(ModelArchiverError): ModelExportUtils.find_unique(files, suffix) # noinspection PyClassHasNoInit class TestCleanTempFiles: - @pytest.fixture() def patches(self, mocker): - Patches = namedtuple('Patches', ['remove']) - patches = Patches(mocker.patch('os.remove')) + Patches = namedtuple("Patches", ["remove"]) + patches = Patches(mocker.patch("os.remove")) patches.remove.return_value = True return patches def test_clean_call(self, patches): - temp_files = ['a', 'b', 'c'] + temp_files = ["a", "b", "c"] ModelExportUtils.clean_temp_files(temp_files) patches.remove.assert_called() @@ -127,21 +142,27 @@ def test_clean_call(self, patches): # noinspection PyClassHasNoInit class TestGenerateManifestProps: - class Namespace: def __init__(self, **kwargs): self.__dict__.update(kwargs) - model_name = 'my-model' - handler = 'a.py::my-awesome-func' - serialized_file = 'model.pt' - model_file = 'model.pt' + model_name = "my-model" + handler = "a.py::my-awesome-func" + serialized_file = "model.pt" + model_file = "model.pt" version = "1.0" requirements_file = "requirements.txt" - args = Namespace(model_name=model_name, handler=handler, runtime=RuntimeType.PYTHON.value, - serialized_file=serialized_file, model_file=model_file, version=version, - requirements_file=requirements_file) + args = Namespace( + model_name=model_name, + handler=handler, + runtime=RuntimeType.PYTHON.value, + serialized_file=serialized_file, + model_file=model_file, + version=version, + requirements_file=requirements_file, + config_file=None, + ) def test_model(self): mod = ModelExportUtils.generate_model(self.args) @@ -151,52 +172,141 @@ def test_model(self): def 
test_manifest_json(self): manifest = ModelExportUtils.generate_manifest_json(self.args) manifest_json = json.loads(manifest) - assert manifest_json['runtime'] == RuntimeType.PYTHON.value - assert 'model' in manifest_json - assert 'license' not in manifest_json + assert manifest_json["runtime"] == RuntimeType.PYTHON.value + assert "model" in manifest_json + assert "license" not in manifest_json # noinspection PyClassHasNoInit class TestModelNameRegEx: - def test_regex_pass(self): - model_names = ['my-awesome-model', 'Aa.model', 'a', 'aA.model', 'a1234.model', 'a-A-A.model', '123-abc'] + model_names = [ + "my-awesome-model", + "Aa.model", + "a", + "aA.model", + "a1234.model", + "a-A-A.model", + "123-abc", + ] for m in model_names: ModelExportUtils.check_model_name_regex_or_exit(m) def test_regex_fail(self): - model_names = ['abc%', '123$abc', 'abc!123', '@123', '(model', 'mdoel)', - '12*model-a.model', '##.model', '-.model'] + model_names = [ + "abc%", + "123$abc", + "abc!123", + "@123", + "(model", + "mdoel)", + "12*model-a.model", + "##.model", + "-.model", + ] for m in model_names: with pytest.raises(ModelArchiverError): ModelExportUtils.check_model_name_regex_or_exit(m) # noinspection PyClassHasNoInit class TestFileFilter: - - files_to_exclude = {'abc.onnx'} + files_to_exclude = {"abc.onnx"} def test_with_return_false(self): - assert ModelExportUtils.file_filter('abc.onnx', self.files_to_exclude) is False + assert ( + ModelExportUtils.file_filter("abc.onnx", self.files_to_exclude) is False + ) def test_with_pyc(self): - assert ModelExportUtils.file_filter('abc.pyc', self.files_to_exclude) is False + assert ( + ModelExportUtils.file_filter("abc.pyc", self.files_to_exclude) is False + ) def test_with_ds_store(self): - assert ModelExportUtils.file_filter('.DS_Store', self.files_to_exclude) is False + assert ( + ModelExportUtils.file_filter(".DS_Store", self.files_to_exclude) + is False + ) def test_with_return_true(self): - assert ModelExportUtils.file_filter('abc.mxnet', self.files_to_exclude) is True + assert ( + ModelExportUtils.file_filter("abc.mxnet", self.files_to_exclude) is True + ) # noinspection PyClassHasNoInit class TestDirectoryFilter: - - unwanted_dirs = {'__MACOSX', '__pycache__'} + unwanted_dirs = {"__MACOSX", "__pycache__"} def test_with_unwanted_dirs(self): - assert ModelExportUtils.directory_filter('__MACOSX', self.unwanted_dirs) is False + assert ( + ModelExportUtils.directory_filter("__MACOSX", self.unwanted_dirs) + is False + ) def test_with_starts_with_dot(self): - assert ModelExportUtils.directory_filter('.gitignore', self.unwanted_dirs) is False + assert ( + ModelExportUtils.directory_filter(".gitignore", self.unwanted_dirs) + is False + ) def test_with_return_true(self): - assert ModelExportUtils.directory_filter('my-model', self.unwanted_dirs) is True + assert ( + ModelExportUtils.directory_filter("my-model", self.unwanted_dirs) + is True + ) + + +def create_manifest_from_test_json(test_json): + test_ = {k.replace("-", "_"): v for k, v in test_json.items()} + test_["requirements_file"] = "" + test_["runtime"] = RuntimeType.PYTHON3.value + test_["config_file"] = "" + + args = namedtuple("Model", test_.keys())(**test_) + manifest = ModelExportUtils.generate_manifest_json(args) + return manifest + + +def prepare_model_dir(test_name, integ_tests): + integ_tests = list(filter(lambda t: t["name"] == test_name, integ_tests)) + assert len(integ_tests) == 1 + test = integ_tests[0] + + keys = ( + "model-file", + "serialized-file", + "handler", + "extra-files", + 
"config-file", + ) + artifact_files = {k.replace("-", "_"): test[k] for k in keys} + + model_path = ModelExportUtils.copy_artifacts(test["model-name"], **artifact_files) + + manifest = create_manifest_from_test_json(test) + return manifest, model_path + + +def test_archive_creation_with_zip_store(tmp_path, integ_tests, mocker): + manifest, model_path = prepare_model_dir("packaging_zip_store_mar", integ_tests) + + ModelExportUtils.archive( + tmp_path, + "zip-store", + model_path, + manifest, + archive_format="zip-store", + ) + + ModelExportUtils.archive( + tmp_path, "zip", model_path, manifest, archive_format="default" + ) + + stored_size = Path(tmp_path).joinpath("zip-store.mar").stat().st_size + zipped_size = Path(tmp_path).joinpath("zip.mar").stat().st_size + + assert zipped_size < stored_size + + +def test_missing_extra_files(integ_tests): + with pytest.raises(FileNotFoundError): + prepare_model_dir("missing_extra_files", integ_tests) diff --git a/model-archiver/model_archiver/tests/unit_tests/test_version.py b/model-archiver/model_archiver/tests/unit_tests/test_version.py index 9571155338..d72e20d66f 100644 --- a/model-archiver/model_archiver/tests/unit_tests/test_version.py +++ b/model-archiver/model_archiver/tests/unit_tests/test_version.py @@ -1,15 +1,16 @@ +from pathlib import Path - -import os import model_archiver +MODEL_ARCHIVER_ROOT_DIR = Path(__file__).parent.parent.parent + def test_model_export_tool_version(): """ Test the model archive version :return: """ - with open(os.path.join('model_archiver', 'version.txt')) as f: + with open(MODEL_ARCHIVER_ROOT_DIR.joinpath("version.txt")) as f: __version__ = f.readline().strip() assert __version__ == str(model_archiver.__version__), "Versions do not match" diff --git a/model-archiver/model_archiver/version.txt b/model-archiver/model_archiver/version.txt index 39e898a4f9..a3df0a6959 100644 --- a/model-archiver/model_archiver/version.txt +++ b/model-archiver/model_archiver/version.txt @@ -1 +1 @@ -0.7.1 +0.8.0 diff --git a/mypy.ini b/mypy.ini index fbc0f98dbc..2e1165bdc1 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,7 +1,7 @@ [mypy] ; A good-first-issue is to add types to a file ; As you do start adding them in files and slowly make the excluded files empty -files = ts/context.py +files = ts/context.py, ts/model_server.py exclude = examples, binaries, ts_scripts, test, kubernetes, benchmarks, model-archiver, workflow-archiver, ts/tests, ts/utils diff --git a/requirements/common.txt b/requirements/common.txt index 43323ac93c..57b0bd579f 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -1,12 +1,6 @@ -psutil -cython -wheel -requests -pillow==9.3.0 -captum -packaging -numpy; sys_platform != 'win32' -numpy==1.19.3; sys_platform == 'win32' #see https://tinyurl.com/y3dm3h86 -nvgpu; sys_platform != 'win32' -nvgpu==0.8.0; sys_platform == 'win32' -pyyaml +psutil==5.9.5 +requests==2.31.0 +captum==0.6.0 +packaging==23.1 +pynvml==11.4.1 +pyyaml==6.0 diff --git a/requirements/common_gpu.txt b/requirements/common_gpu.txt new file mode 100644 index 0000000000..18037b19bf --- /dev/null +++ b/requirements/common_gpu.txt @@ -0,0 +1,2 @@ +nvgpu; sys_platform != 'win32' +nvgpu==0.8.0; sys_platform == 'win32' diff --git a/requirements/developer.txt b/requirements/developer.txt index d497ece393..be985a4519 100644 --- a/requirements/developer.txt +++ b/requirements/developer.txt @@ -1,17 +1,19 @@ -r common.txt -mock -pytest +mock==5.0.2 +pytest==7.3.1 pylint==2.6.0 -pytest-mock -pytest-cov -grpcio -protobuf -grpcio-tools 
-transformers==4.25.1 -pyspelling -pygit2 -pyspelling -pre-commit -twine -mypy -intel_extension_for_pytorch; sys_platform != 'win32' and sys_platform != 'darwin' +pytest-mock==3.10.0 +pytest-cov==4.1.0 +grpcio==1.54.2 +protobuf==4.23.1 +grpcio-tools==1.54.2 +transformers==4.30.0 +pyspelling==2.8.2 +pygit2==1.12.1 +pre-commit==3.3.2 +twine==4.0.2 +mypy==1.3.0 +torchpippy==0.1.1 +intel_extension_for_pytorch==2.0.100; sys_platform != 'win32' and sys_platform != 'darwin' +onnxruntime==1.15.0 +onnx==1.14.0 diff --git a/requirements/production.txt b/requirements/production.txt deleted file mode 100644 index c3899b0c47..0000000000 --- a/requirements/production.txt +++ /dev/null @@ -1 +0,0 @@ --r common.txt \ No newline at end of file diff --git a/requirements/torch_common.txt b/requirements/torch_common.txt new file mode 100644 index 0000000000..1518b7ab8f --- /dev/null +++ b/requirements/torch_common.txt @@ -0,0 +1,5 @@ +cython==0.29.34 +wheel==0.40.0 +pillow==9.3.0 +numpy==1.24.3; sys_platform != 'win32' +numpy==1.19.3; sys_platform == 'win32' #see https://tinyurl.com/y3dm3h86 diff --git a/requirements/torch_cu101_linux.txt b/requirements/torch_cu101_linux.txt index c28839eb44..1c931e7797 100644 --- a/requirements/torch_cu101_linux.txt +++ b/requirements/torch_cu101_linux.txt @@ -1,8 +1,6 @@ #pip install torch==1.8.1+cu101 torchvision==0.9.1+cu101 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html --find-links https://download.pytorch.org/whl/torch_stable.html -cython -wheel -pillow==9.3.0 +-r torch_common.txt torch==1.8.1+cu101; sys_platform == 'linux' torchvision==0.9.1+cu101; sys_platform == 'linux' torchtext==0.9.1; sys_platform == 'linux' diff --git a/requirements/torch_cu101_windows.txt b/requirements/torch_cu101_windows.txt index 67f7b0a89d..28acba4d8a 100644 --- a/requirements/torch_cu101_windows.txt +++ b/requirements/torch_cu101_windows.txt @@ -1,6 +1,7 @@ #pip install torch==1.8.1+cu101 torchvision==0.9.1+cu101 torchaudio===0.8.1 -f https://download.pytorch.org/whl/torch_stable.html --find-links https://download.pytorch.org/whl/torch_stable.html +-r torch_common.txt torch==1.8.1+cu101; sys_platform == 'win32' torchvision==0.9.1+cu101; sys_platform == 'win32' torchtext==0.9.1; sys_platform == 'win32' -torchaudio==0.8.1; sys_platform == 'win32' \ No newline at end of file +torchaudio==0.8.1; sys_platform == 'win32' diff --git a/requirements/torch_cu102_linux.txt b/requirements/torch_cu102_linux.txt index 7e67fe83e7..3bb1cb3a01 100644 --- a/requirements/torch_cu102_linux.txt +++ b/requirements/torch_cu102_linux.txt @@ -1,7 +1,5 @@ # pip install torch torchvision torchaudio -cython -wheel -pillow==9.3.0 +-r torch_common.txt torch==1.12.0; sys_platform == 'linux' torchvision==0.13.0; sys_platform == 'linux' torchtext==0.13.0; sys_platform == 'linux' diff --git a/requirements/torch_cu102_windows.txt b/requirements/torch_cu102_windows.txt index 51cdc6092e..fce4f82566 100644 --- a/requirements/torch_cu102_windows.txt +++ b/requirements/torch_cu102_windows.txt @@ -1,6 +1,7 @@ #pip install torch===1.9.0 torchvision===0.10.0 torchaudio===0.9.0 -f https://download.pytorch.org/whl/torch_stable.html --find-links https://download.pytorch.org/whl/torch_stable.html +-r torch_common.txt torch==1.10.0+cu102; sys_platform == 'win32' torchvision==0.11.1+cu102; sys_platform == 'win32' torchtext==0.11.0; sys_platform == 'win32' -torchaudio==0.10.0+cu102; sys_platform == 'win32' \ No newline at end of file +torchaudio==0.10.0+cu102; sys_platform == 'win32' diff --git 
a/requirements/torch_cu111_linux.txt b/requirements/torch_cu111_linux.txt index c15e853a6c..40348b24cb 100644 --- a/requirements/torch_cu111_linux.txt +++ b/requirements/torch_cu111_linux.txt @@ -1,8 +1,6 @@ #pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio===0.9.0 -f https://download.pytorch.org/whl/torch_stable.html --find-links https://download.pytorch.org/whl/torch_stable.html -cython -wheel -pillow==9.3.0 +-r torch_common.txt torch==1.9.0+cu111; sys_platform == 'linux' torchvision==0.10.0+cu111; sys_platform == 'linux' torchtext==0.10.0; sys_platform == 'linux' diff --git a/requirements/torch_cu111_windows.txt b/requirements/torch_cu111_windows.txt index 1d6151de13..7c2a32d87e 100644 --- a/requirements/torch_cu111_windows.txt +++ b/requirements/torch_cu111_windows.txt @@ -1,5 +1,6 @@ #pip install torch===1.9.0+cu111 torchvision===0.10.0+cu111 torchaudio===0.9.0 -f https://download.pytorch.org/whl/torch_stable.html --find-links https://download.pytorch.org/whl/torch_stable.html +-r torch_common.txt torch==1.9.0+cu111; sys_platform == 'win32' torchvision==0.10.0+cu111; sys_platform == 'win32' torchtext==0.10.0; sys_platform == 'win32' diff --git a/requirements/torch_cu113_linux.txt b/requirements/torch_cu113_linux.txt index e072fe1b94..ce966cfac7 100644 --- a/requirements/torch_cu113_linux.txt +++ b/requirements/torch_cu113_linux.txt @@ -1,8 +1,6 @@ #pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 --extra-index-url https://download.pytorch.org/whl/cu113 -cython -wheel -pillow==9.3.0 +-r torch_common.txt torch==1.12.0+cu113; sys_platform == 'linux' torchvision==0.13.0+cu113; sys_platform == 'linux' torchtext==0.13.0; sys_platform == 'linux' diff --git a/requirements/torch_cu113_windows.txt b/requirements/torch_cu113_windows.txt index a81d81016f..19731e99db 100644 --- a/requirements/torch_cu113_windows.txt +++ b/requirements/torch_cu113_windows.txt @@ -1,5 +1,6 @@ #pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 --extra-index-url https://download.pytorch.org/whl/cu113 +-r torch_common.txt torch==1.12.0+cu113; sys_platform == 'win32' torchvision==0.13.0+cu113; sys_platform == 'win32' torchtext==0.13.0; sys_platform == 'win32' diff --git a/requirements/torch_cu116_linux.txt b/requirements/torch_cu116_linux.txt index 05a814c546..702f3bae24 100644 --- a/requirements/torch_cu116_linux.txt +++ b/requirements/torch_cu116_linux.txt @@ -1,8 +1,6 @@ #pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116 --extra-index-url https://download.pytorch.org/whl/cu116 -cython -wheel -pillow==9.3.0 +-r torch_common.txt torch==1.13.1+cu116; sys_platform == 'linux' torchvision==0.14.1+cu116; sys_platform == 'linux' torchtext==0.14.1; sys_platform == 'linux' diff --git a/requirements/torch_cu116_windows.txt b/requirements/torch_cu116_windows.txt index 10b0c7db7c..92a7d98f9e 100644 --- a/requirements/torch_cu116_windows.txt +++ b/requirements/torch_cu116_windows.txt @@ -1,5 +1,6 @@ #pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116 --extra-index-url https://download.pytorch.org/whl/cu116 +-r torch_common.txt torch==1.13.1+cu116; sys_platform == 'win32' torchvision==0.14.1+cu116; sys_platform == 'win32' torchtext==0.14.1; sys_platform == 'win32' diff --git a/requirements/torch_cu117_linux.txt b/requirements/torch_cu117_linux.txt index 083c68ecff..ebfa68c47c 100644 --- a/requirements/torch_cu117_linux.txt 
+++ b/requirements/torch_cu117_linux.txt @@ -1,9 +1,7 @@ #pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117 --extra-index-url https://download.pytorch.org/whl/cu117 -cython -wheel -pillow==9.3.0 -torch==1.13.1+cu117; sys_platform == 'linux' -torchvision==0.14.1+cu117; sys_platform == 'linux' -torchtext==0.14.1; sys_platform == 'linux' -torchaudio==0.13.1+cu117; sys_platform == 'linux' +-r torch_common.txt +torch==2.0.1+cu117; sys_platform == 'linux' +torchvision==0.15.2+cu117; sys_platform == 'linux' +torchtext==0.15.2; sys_platform == 'linux' +torchaudio==2.0.2+cu117; sys_platform == 'linux'
diff --git a/requirements/torch_cu117_windows.txt b/requirements/torch_cu117_windows.txt index c51f59d627..03debbb6e5 100644 --- a/requirements/torch_cu117_windows.txt +++ b/requirements/torch_cu117_windows.txt @@ -1,6 +1,7 @@ #pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117 --extra-index-url https://download.pytorch.org/whl/cu117 -torch==1.13.1+cu117; sys_platform == 'win32' -torchvision==0.14.1+cu117; sys_platform == 'win32' -torchtext==0.14.1; sys_platform == 'win32' -torchaudio==0.13.1+cu117; sys_platform == 'win32' +-r torch_common.txt +torch==2.0.1+cu117; sys_platform == 'win32' +torchvision==0.15.2+cu117; sys_platform == 'win32' +torchtext==0.15.2; sys_platform == 'win32' +torchaudio==2.0.2+cu117; sys_platform == 'win32'
diff --git a/requirements/torch_cu118_linux.txt b/requirements/torch_cu118_linux.txt new file mode 100644 index 0000000000..d34969ef55 --- /dev/null +++ b/requirements/torch_cu118_linux.txt @@ -0,0 +1,7 @@ +#pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 +--extra-index-url https://download.pytorch.org/whl/cu118 +-r torch_common.txt +torch==2.0.1+cu118; sys_platform == 'linux' +torchvision==0.15.2+cu118; sys_platform == 'linux' +torchtext==0.15.2; sys_platform == 'linux' +torchaudio==2.0.2+cu118; sys_platform == 'linux'
diff --git a/requirements/torch_cu118_windows.txt b/requirements/torch_cu118_windows.txt new file mode 100644 index 0000000000..d34969ef55 --- /dev/null +++ b/requirements/torch_cu118_windows.txt @@ -0,0 +1,7 @@ +#pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 +--extra-index-url https://download.pytorch.org/whl/cu118 +-r torch_common.txt +torch==2.0.1+cu118; sys_platform == 'win32' +torchvision==0.15.2+cu118; sys_platform == 'win32' +torchtext==0.15.2; sys_platform == 'win32' +torchaudio==2.0.2+cu118; sys_platform == 'win32'
diff --git a/requirements/torch_cu92.txt b/requirements/torch_cu92.txt index 3ab1f411c9..3867de905e 100644 --- a/requirements/torch_cu92.txt +++ b/requirements/torch_cu92.txt @@ -1,4 +1,5 @@ +-r torch_common.txt torch==1.8.1+cu92 torchvision==0.9.1+cu92 torchtext==0.9.1 -torchaudio==0.8.1 \ No newline at end of file +torchaudio==0.8.1
diff --git a/requirements/torch_cu92_linux.txt b/requirements/torch_cu92_linux.txt index 69765992ec..f93cb590c6 100644 --- a/requirements/torch_cu92_linux.txt +++ b/requirements/torch_cu92_linux.txt @@ -1,8 +1,6 @@ #pip install torch==1.8.1+cu92 torchvision==0.9.1+cu92 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html --find-links https://download.pytorch.org/whl/torch_stable.html -cython -wheel -pillow==9.3.0 +-r torch_common.txt torch==1.8.1+cu92; sys_platform == 'linux' torchvision==0.9.1+cu92; sys_platform == 'linux' torchtext==0.9.1; sys_platform == 'linux'
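Note on the layout above: every torch_*.txt variant now pulls its shared build pins from torch_common.txt via pip's -r include, and relies on PEP 508 environment markers (sys_platform == 'linux' / 'win32') so that only the lines matching the running OS are installed. Below is a minimal sketch of how such markers evaluate, using the packaging library already pinned in requirements/common.txt; the explicit environment dicts are illustrative overrides, not part of this diff:

from packaging.markers import Marker

# pip skips any requirement whose marker evaluates to False for the
# running interpreter, so a Windows requirements file whose lines all
# carried sys_platform == 'linux' would install nothing on Windows.
marker = Marker("sys_platform == 'win32'")
print(marker.evaluate({"sys_platform": "win32"}))  # True
print(marker.evaluate({"sys_platform": "linux"}))  # False

diff --git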
a/requirements/torch_darwin.txt b/requirements/torch_darwin.txt index 9cc30c8440..0e3ff64f98 100644 --- a/requirements/torch_darwin.txt +++ b/requirements/torch_darwin.txt @@ -1,5 +1,6 @@ #pip install torch torchvision torchaudio -torch==1.13.1; sys_platform == 'darwin' -torchvision==0.14.1; sys_platform == 'darwin' -torchtext==0.14.1; sys_platform == 'darwin' -torchaudio==0.13.1; sys_platform == 'darwin' +-r torch_common.txt +torch==2.0.1; sys_platform == 'darwin' +torchvision==0.15.2; sys_platform == 'darwin' +torchtext==0.15.2; sys_platform == 'darwin' +torchaudio==2.0.2; sys_platform == 'darwin' diff --git a/requirements/torch_linux.txt b/requirements/torch_linux.txt index 116ce16686..96424b9ca4 100644 --- a/requirements/torch_linux.txt +++ b/requirements/torch_linux.txt @@ -1,9 +1,7 @@ #pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://download.pytorch.org/whl/cpu -cython -wheel -pillow==9.3.0 -torch==1.13.1+cpu; sys_platform == 'linux' -torchvision==0.14.1+cpu; sys_platform == 'linux' -torchtext==0.14.1; sys_platform == 'linux' -torchaudio==0.13.1+cpu; sys_platform == 'linux' +-r torch_common.txt +torch==2.0.1+cpu; sys_platform == 'linux' +torchvision==0.15.2+cpu; sys_platform == 'linux' +torchtext==0.15.2; sys_platform == 'linux' +torchaudio==2.0.2+cpu; sys_platform == 'linux' diff --git a/requirements/torch_windows.txt b/requirements/torch_windows.txt index 7552033ba3..a9ee0f348d 100644 --- a/requirements/torch_windows.txt +++ b/requirements/torch_windows.txt @@ -1,7 +1,6 @@ #pip install torch torchvision torchaudio -wheel -torch==1.13.1; sys_platform == 'win32' -torchvision==0.14.1; sys_platform == 'win32' -torchtext==0.14.1; sys_platform == 'win32' -torchaudio==0.13.1; sys_platform == 'win32' -pillow==9.3.0 +-r torch_common.txt +torch==2.0.1; sys_platform == 'win32' +torchvision==0.15.2; sys_platform == 'win32' +torchtext==0.15.2; sys_platform == 'win32' +torchaudio==2.0.2; sys_platform == 'win32' diff --git a/serving-sdk/src/main/java/org/pytorch/serve/servingsdk/metrics/InbuiltMetricsRegistry.java b/serving-sdk/src/main/java/org/pytorch/serve/servingsdk/metrics/InbuiltMetricsRegistry.java index b98a4baeef..4903ff7417 100644 --- a/serving-sdk/src/main/java/org/pytorch/serve/servingsdk/metrics/InbuiltMetricsRegistry.java +++ b/serving-sdk/src/main/java/org/pytorch/serve/servingsdk/metrics/InbuiltMetricsRegistry.java @@ -10,6 +10,7 @@ public class InbuiltMetricsRegistry { public static final String INFERENCEREQUESTS = "InferenceRequests"; public static final String QUEUETIME = "QueueTime"; + public static final String REQUESTPRIORITY = "RequestPriority"; public static final String BACKENDRESPONSETIME = "BackendResponseTime"; public static final String HANDLERTIME = "HandlerTime"; public static final String WORKERTHREADTIME = "WorkerThreadTime"; diff --git a/test/postman/explanation_api_test_collection.json b/test/postman/explanation_api_test_collection.json index 042f284eb8..c0f5f9cabc 100644 --- a/test/postman/explanation_api_test_collection.json +++ b/test/postman/explanation_api_test_collection.json @@ -192,7 +192,7 @@ "id": "0ebdb1a8-dce9-4c2e-81cf-116a30003ac8", "exec": [ "pm.test(\"Successful GET request\", function () {", - " let pattern = new RegExp('ts_inference_latency_microseconds{uuid=\"[\\\\w]{8}(-[\\\\w]{4}){3}-[\\\\w]{12}\",model_name=\"'+pm.variables.get(\"model_name\"));", + " let pattern = new RegExp('.*Requests2XX\\\\{Level=\"Host\",Hostname=\".*\",\\\\} \\\\d+\\\\.\\\\d+.*');", " 
pm.expect(pm.response.text()).to.match(pattern);", "});" ], @@ -219,4 +219,4 @@ } ], "protocolProfileBehavior": {} -} \ No newline at end of file +} diff --git a/test/postman/inference_api_test_collection.json b/test/postman/inference_api_test_collection.json index 6c17aec684..11e8f3a4e3 100644 --- a/test/postman/inference_api_test_collection.json +++ b/test/postman/inference_api_test_collection.json @@ -208,7 +208,7 @@ "id": "9605fe7f-9911-495a-b6c3-b495e1b1dd21", "exec": [ "pm.test(\"Successful GET request\", function () {", - " let pattern = new RegExp('ts_inference_latency_microseconds{uuid=\"[\\\\w]{8}(-[\\\\w]{4}){3}-[\\\\w]{12}\",model_name=\"'+pm.variables.get(\"model_name\"));", + " let pattern = new RegExp('.*Requests2XX\\\\{Level=\"Host\",Hostname=\".*\",\\\\} \\\\d+\\\\.\\\\d+.*');", " pm.expect(pm.response.text()).to.match(pattern);", "});" ], @@ -235,4 +235,4 @@ } ], "protocolProfileBehavior": {} -} \ No newline at end of file +} diff --git a/test/postman/inference_stream_data.json b/test/postman/inference_stream_data.json new file mode 100644 index 0000000000..94f598ef27 --- /dev/null +++ b/test/postman/inference_stream_data.json @@ -0,0 +1,13 @@ +[ + { + "url":"https://torchserve.pytorch.org/mar_files/echo_stream.mar", + "model_name":"echo_stream", + "worker":1, + "synchronous":"true", + "file":"../examples/text_classification/sample_text.txt", + "content-type":"text/plain", + "validator":"image_classification", + "expected": "hello hello hello hello world ", + "tolerance":5 + } +] diff --git a/test/postman/kf_api_test_collection.json b/test/postman/kf_api_test_collection.json index ff9fcefbeb..3c29e877b1 100644 --- a/test/postman/kf_api_test_collection.json +++ b/test/postman/kf_api_test_collection.json @@ -304,7 +304,7 @@ "id": "6bb26bdb-6069-48fa-b270-9ba694ad1f35", "exec": [ "pm.test(\"Successful GET request\", function () {", - " let pattern = new RegExp('ts_inference_latency_microseconds{uuid=\"[\\\\w]{8}(-[\\\\w]{4}){3}-[\\\\w]{12}\",model_name=\"'+pm.variables.get(\"model_name\"));", + " let pattern = new RegExp('.*Requests2XX\\\\{Level=\"Host\",Hostname=\".*\",\\\\} \\\\d+\\\\.\\\\d+.*');", " pm.expect(pm.response.text()).to.match(pattern);", "});" ], @@ -331,4 +331,4 @@ } ], "protocolProfileBehavior": {} -} \ No newline at end of file +} diff --git a/test/postman/kf_inference_api_test_collection.json b/test/postman/kf_inference_api_test_collection.json index 8e6453de2d..10e09b5f71 100644 --- a/test/postman/kf_inference_api_test_collection.json +++ b/test/postman/kf_inference_api_test_collection.json @@ -323,7 +323,7 @@ "id": "e6253ef2-c614-492f-ac2b-12da8b8e92fc", "exec": [ "pm.test(\"Successful GET request\", function () {", - " let pattern = new RegExp('ts_inference_latency_microseconds{uuid=\"[\\\\w]{8}(-[\\\\w]{4}){3}-[\\\\w]{12}\",model_name=\"'+pm.variables.get(\"model_name\"));", + " let pattern = new RegExp('.*Requests2XX\\\\{Level=\"Host\",Hostname=\".*\",\\\\} \\\\d+\\\\.\\\\d+.*');", " pm.expect(pm.response.text()).to.match(pattern);", "});" ], @@ -350,4 +350,4 @@ } ], "protocolProfileBehavior": {} -} \ No newline at end of file +} diff --git a/test/postman/kfv2_api_test_collection.json b/test/postman/kfv2_api_test_collection.json index f89dd92211..c0e64c15aa 100644 --- a/test/postman/kfv2_api_test_collection.json +++ b/test/postman/kfv2_api_test_collection.json @@ -295,7 +295,7 @@ "script": { "exec": [ "pm.test(\"Successful GET request\", function () {", - " let pattern = new 
RegExp('ts_inference_latency_microseconds{uuid=\"[\\\\w]{8}(-[\\\\w]{4}){3}-[\\\\w]{12}\",model_name=\"'+pm.variables.get(\"model_name\"));", + " let pattern = new RegExp('.*Requests2XX\\\\{Level=\"Host\",Hostname=\".*\",\\\\} \\\\d+\\\\.\\\\d+.*');", " pm.expect(pm.response.text()).to.match(pattern);", "});" ], @@ -321,4 +321,4 @@ "response": [] } ] -} \ No newline at end of file +} diff --git a/test/postman/kfv2_inference_data.json b/test/postman/kfv2_inference_data.json index f0ad09d2d0..e00c715450 100644 --- a/test/postman/kfv2_inference_data.json +++ b/test/postman/kfv2_inference_data.json @@ -6,10 +6,10 @@ "file": "../kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_tensor.json", "content-type": "application/json", "expected": { - "id":"d3b15cad-50a2-4eaf-80ce-8b0a428bd298","model_name":"mnist","model_version":"1.0","outputs":[{"name":"predict","shape":[],"datatype":"INT64","data":[1]}] + "id":"d3b15cad-50a2-4eaf-80ce-8b0a428bd298","model_name":"mnist","model_version":"1.0","outputs":[{"name":"input-0","shape":[],"datatype":"INT64","data":[1]}] }, "expected_explain": { - "id":"d3b15cad-50a2-4eaf-80ce-8b0a428bd298","model_name":"mnist","model_version":"1.0","outputs":[{"name":"explain","shape":[1,28,28],"datatype":"FP64","data":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0040547527881586954,-0.00022612877132135935,-0.00012734132068921815,0.005648369123934234,0.00890478344415316,0.002638536746843638,0.0026802459473054567,-0.002657801646198628,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0002446577521584037,0.0008218454252870746,0.015285916556975589,0.007512832032495784,0.007094984582680408,0.003405668414819342,-0.0020919248349481525,-0.0007800296083653554,0.022995877395463753,0.019004328861537745,-0.0012529557611487667,-0.0014666116853554992,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.005298396299742967,-0.0007901602589731957,0.00390606628994132,0.02317408192562863,0.01723791734244863,0.010867034230381416,0.003001563449593484,0.006224217749113618,0.006120711993702211,0.016736329208148985,0.005674718979287411,0.0043441351074201716,-0.0012328422456581033,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0006867354470939666,0.009772898561731134,-0.003875493029617137,0.0017986933105143274,0.00130754408083684,-0.0024510981201440493,-0.0008806773035242951,0,0,-0.00014277890938077845,-0.009322312923101268,0.020608317831970053,0.0043513950202448085,-0.0007875567959471073,-0.0009075897498983682,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00022247236805959426,-0.0007829029576392685,0.0026663695298724034,0.000973336645392922,0,0,0,0,0,0,0,0.0004323206544010433,0.023657171718451487,0.010694845123018274,-0.0023759529649896504,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.002074797027562978,-0.0023201009712006193,-0.0012899209165390638,0,0,0,0,0,0,0,0,0,0.007629679307476711,0.010448627340902272,0.00025032896574585353,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0003770835815454417,-0.005156369326824804,0.0012477581647151723,0,0,0,0,0,0,0,0,0,-0.00004442522927758585,0.010248046478304183,0.0009971132925931643,0,0,0,0,0,0,0,0,0,0,0,0,0.0004501049686186689,-0.001963053861562753,-0.0006664790954766908,0.0020157404181157298,0,0,0,0,0,0,0,0,0,-0.0022144570001665495,0.008361584182210209,0.0031401945811928064,0,0,0,0,0,0,0,0,0,0,0,0,-0.0028943546389954404,-0.0031301382952544582,0.002113252627152244,0,0,0,0,0,0,0,0,0,0,-0.0010321050313140568,0.008905753962245818,0.0028464382842652274,0,0,0
,0,0,0,0,0,0,0,0,0,-0.005305289160784239,-0.001927110161077484,0.0012090041616218117,0,0,0,0,0,0,0,0,0,0,-0.0011945155110826835,0.005654443253323257,0.0020132074296893847,0,0,0,0,0,0,0,0,0,0,0,0,-0.0014689358191145255,0.00107434126494373,0,0,0,0,0,0,0,0,0,0,0,-0.0017047979656755515,0.002906605326916773,-0.0007805868832212293,0,0,0,0,0,0,0,0,0,0,0,0.000055417251836277426,0.0014516115955483288,0.0002827699382308426,0,0,0,0,0,0,0,0,0,0,0,-0.0014401406798288333,0.002381249994012627,0.002146825485493657,0,0,0,0,0,0,0,0,0,0,0,0.0011500530011764514,0.00028650115062629793,0.0029798149728837,0,0,0,0,0,0,0,0,0,0,0,-0.0017750294246144378,0.0008339858039134471,-0.0037707389974128264,0,0,0,0,0,0,0,0,0,0,0,-0.0006093176702196316,-0.0004690580448827246,0.0034053215399203448,0,0,0,0,0,0,0,0,0,0,-0.0007450010561445004,0.0012987672807208413,-0.00849924754154327,-0.00006145174356975924,0,0,0,0,0,0,0,0,0,0,0,0.0011809727047705845,-0.0018384766530189604,0.005411106767295053,0,0,0,0,0,0,0,0,0,-0.0021392342405935397,0.0003259162378301207,-0.005276118419877435,-0.001950983939698961,-9.545680860124795e-7,0,0,0,0,0,0,0,0,0,0,0,0.000777240560389088,-0.00015179538793786839,0.006481484638650515,0,0,0,0,0,0,0,0,0.00008098065166629173,-0.0024904261335704243,-0.0020718616274916063,-0.00005341157801587443,-0.00045564727357325394,0,0,0,0,0,0,0,0,0,0,0,0,0,0.002275098238597264,0.0017164058060623701,0.00032213445581197173,0,0,0,0,0,-0.001556028266851665,0.0000910724863950236,0.0008772840524484654,0.000650298006504863,-0.004128780934527031,0.0006030386677594234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0013959957755626813,0.00267915270212672,0.0023995009632858484,-0.0004496094979322396,0.003101832911668704,0.007494535603697501,0.002864118701309854,-0.003052590375330078,0.003420222741405451,0.001492401842506996,-0.0009357391552120744,0.0007856228750089005,-0.00184339736789655,0.00001603187900317098,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0006999018662842894,0.004382251035718981,-0.0035419315151426845,-0.002889674705246964,-0.000487345313107622,-0.006087344960098864,0.0003882250941768635,0.0025336419028892817,-0.004352836272916637,-0.0006079418201851047,-0.003810133084711927,-0.0008284412435870998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0010901530193446261,-0.013135007265412056,0.000473452169279359,0.002050423312678761,-0.00660945214953636,0.00236478632058849,0.004678920566995346,-0.0018122525188342855,0.002137538293354298,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}] + 
"id":"d3b15cad-50a2-4eaf-80ce-8b0a428bd298","model_name":"mnist","model_version":"1.0","outputs":[{"name":"input-0","shape":[1,28,28],"datatype":"FP64","data":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0040547527881586954,-0.00022612877132135935,-0.00012734132068921815,0.005648369123934234,0.00890478344415316,0.002638536746843638,0.0026802459473054567,-0.002657801646198628,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0002446577521584037,0.0008218454252870746,0.015285916556975589,0.007512832032495784,0.007094984582680408,0.003405668414819342,-0.0020919248349481525,-0.0007800296083653554,0.022995877395463753,0.019004328861537745,-0.0012529557611487667,-0.0014666116853554992,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.005298396299742967,-0.0007901602589731957,0.00390606628994132,0.02317408192562863,0.01723791734244863,0.010867034230381416,0.003001563449593484,0.006224217749113618,0.006120711993702211,0.016736329208148985,0.005674718979287411,0.0043441351074201716,-0.0012328422456581033,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0006867354470939666,0.009772898561731134,-0.003875493029617137,0.0017986933105143274,0.00130754408083684,-0.0024510981201440493,-0.0008806773035242951,0,0,-0.00014277890938077845,-0.009322312923101268,0.020608317831970053,0.0043513950202448085,-0.0007875567959471073,-0.0009075897498983682,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00022247236805959426,-0.0007829029576392685,0.0026663695298724034,0.000973336645392922,0,0,0,0,0,0,0,0.0004323206544010433,0.023657171718451487,0.010694845123018274,-0.0023759529649896504,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.002074797027562978,-0.0023201009712006193,-0.0012899209165390638,0,0,0,0,0,0,0,0,0,0.007629679307476711,0.010448627340902272,0.00025032896574585353,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0003770835815454417,-0.005156369326824804,0.0012477581647151723,0,0,0,0,0,0,0,0,0,-0.00004442522927758585,0.010248046478304183,0.0009971132925931643,0,0,0,0,0,0,0,0,0,0,0,0,0.0004501049686186689,-0.001963053861562753,-0.0006664790954766908,0.0020157404181157298,0,0,0,0,0,0,0,0,0,-0.0022144570001665495,0.008361584182210209,0.0031401945811928064,0,0,0,0,0,0,0,0,0,0,0,0,-0.0028943546389954404,-0.0031301382952544582,0.002113252627152244,0,0,0,0,0,0,0,0,0,0,-0.0010321050313140568,0.008905753962245818,0.0028464382842652274,0,0,0,0,0,0,0,0,0,0,0,0,-0.005305289160784239,-0.001927110161077484,0.0012090041616218117,0,0,0,0,0,0,0,0,0,0,-0.0011945155110826835,0.005654443253323257,0.0020132074296893847,0,0,0,0,0,0,0,0,0,0,0,0,-0.0014689358191145255,0.00107434126494373,0,0,0,0,0,0,0,0,0,0,0,-0.0017047979656755515,0.002906605326916773,-0.0007805868832212293,0,0,0,0,0,0,0,0,0,0,0,0.000055417251836277426,0.0014516115955483288,0.0002827699382308426,0,0,0,0,0,0,0,0,0,0,0,-0.0014401406798288333,0.002381249994012627,0.002146825485493657,0,0,0,0,0,0,0,0,0,0,0,0.0011500530011764514,0.00028650115062629793,0.0029798149728837,0,0,0,0,0,0,0,0,0,0,0,-0.0017750294246144378,0.0008339858039134471,-0.0037707389974128264,0,0,0,0,0,0,0,0,0,0,0,-0.0006093176702196316,-0.0004690580448827246,0.0034053215399203448,0,0,0,0,0,0,0,0,0,0,-0.0007450010561445004,0.0012987672807208413,-0.00849924754154327,-0.00006145174356975924,0,0,0,0,0,0,0,0,0,0,0,0.0011809727047705845,-0.0018384766530189604,0.005411106767295053,0,0,0,0,0,0,0,0,0,-0.0021392342405935397,0.0003259162378301207,-0.005276118419877435,-0.001950983939698961,-9.545680860124795e-7,
0,0,0,0,0,0,0,0,0,0,0,0.000777240560389088,-0.00015179538793786839,0.006481484638650515,0,0,0,0,0,0,0,0,0.00008098065166629173,-0.0024904261335704243,-0.0020718616274916063,-0.00005341157801587443,-0.00045564727357325394,0,0,0,0,0,0,0,0,0,0,0,0,0,0.002275098238597264,0.0017164058060623701,0.00032213445581197173,0,0,0,0,0,-0.001556028266851665,0.0000910724863950236,0.0008772840524484654,0.000650298006504863,-0.004128780934527031,0.0006030386677594234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0013959957755626813,0.00267915270212672,0.0023995009632858484,-0.0004496094979322396,0.003101832911668704,0.007494535603697501,0.002864118701309854,-0.003052590375330078,0.003420222741405451,0.001492401842506996,-0.0009357391552120744,0.0007856228750089005,-0.00184339736789655,0.00001603187900317098,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0006999018662842894,0.004382251035718981,-0.0035419315151426845,-0.002889674705246964,-0.000487345313107622,-0.006087344960098864,0.0003882250941768635,0.0025336419028892817,-0.004352836272916637,-0.0006079418201851047,-0.003810133084711927,-0.0008284412435870998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0010901530193446261,-0.013135007265412056,0.000473452169279359,0.002050423312678761,-0.00660945214953636,0.00236478632058849,0.004678920566995346,-0.0018122525188342855,0.002137538293354298,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}] }, "tolerance": 25 -}] \ No newline at end of file +}] diff --git a/test/postman/large_model_inference_data.json b/test/postman/large_model_inference_data.json new file mode 100644 index 0000000000..f01cad9b59 --- /dev/null +++ b/test/postman/large_model_inference_data.json @@ -0,0 +1,18 @@ +[ + { + "url":"https://torchserve.s3.amazonaws.com/mar_files/opt-pippy.tar.gz", + "model_name":"opt", + "worker":1, + "synchronous":"true", + "file":"../examples/large_models/Huggingface_pippy/sample_text.txt", + "expected": "Hey, are you conscious? 
Can you talk to me?\nailabi lity" + }, + { + "url":"https://torchserve.s3.amazonaws.com/mar_files/opt-ds.tar.gz", + "model_name":"opt", + "worker":1, + "synchronous":"true", + "file":"../examples/large_models/deepspeed/opt/sample_text.txt", + "expected": "Today the weather is really nice and I am planning on\n\n\nI am planning on the next day.\n\nI am planning on the next day.\n\nI am planning on the next day.\nI am planning on the next" + } +] diff --git a/test/pytest/conftest.py b/test/pytest/conftest.py index 7a72a0f34d..6b16b5a6e8 100644 --- a/test/pytest/conftest.py +++ b/test/pytest/conftest.py @@ -13,6 +13,7 @@ collect_ignore = [] collect_ignore.append("test_example_torchrec_dlrm.py") collect_ignore.append("test_example_near_real_time_video.py") +collect_ignore.append("test_dali_preprocess.py") @pytest.fixture(scope="module") diff --git a/test/pytest/test_dali_preprocess.py b/test/pytest/test_dali_preprocess.py new file mode 100644 index 0000000000..55fbfe5718 --- /dev/null +++ b/test/pytest/test_dali_preprocess.py @@ -0,0 +1,66 @@ +""" +Unit test for near real-time video example +""" +import os +from pathlib import Path + +import pytest +import requests + +from examples.nvidia_dali.custom_handler import DALIHandler +from ts.torch_handler.unit_tests.test_utils.mock_context import MockContext + +CURR_FILE_PATH = Path(__file__).parent +REPO_ROOT_DIR = CURR_FILE_PATH.parent.parent + + +EXAMPLE_ROOT_DIR_DALI = REPO_ROOT_DIR.joinpath("examples", "nvidia_dali") + +MODEL_PTH_FILE = "resnet18-f37072fd.pth" + +EXAMPLE_ROOT_DIR_RESNET = REPO_ROOT_DIR.joinpath( + "examples", "image_classifier", "resnet_18" +) + +EXPECTED_RESULTS = [["tabby", "tiger_cat", "Egyptian_cat", "lynx", "bucket"]] +TEST_CASES = [ + ("kitten.jpg", EXPECTED_RESULTS[0]), +] + + +@pytest.mark.parametrize(("file", "expected_result"), TEST_CASES) +def test_dali_preprocess(monkeypatch, file, expected_result): + + monkeypatch.syspath_prepend(EXAMPLE_ROOT_DIR_RESNET) + + serialized_file = os.path.join(REPO_ROOT_DIR, MODEL_PTH_FILE) + if not os.path.exists(serialized_file): + response = requests.get( + "https://download.pytorch.org/models/resnet18-f37072fd.pth", + allow_redirects=True, + ) + assert response.status_code == 200 + with open(serialized_file, "wb") as f: + f.write(response.content) + + handler = DALIHandler() + ctx = MockContext( + model_pt_file=REPO_ROOT_DIR.joinpath(MODEL_PTH_FILE).as_posix(), + model_dir=EXAMPLE_ROOT_DIR_DALI, + model_file="model.py", + ) + + handler.initialize(ctx) + data = {} + + with open(Path(CURR_FILE_PATH) / "test_data" / file, "rb") as image: + image_file = image.read() + byte_array_type = bytearray(image_file) + data["body"] = byte_array_type + + x = handler.preprocess([data]) + x = handler.inference(x) + x = handler.postprocess(x) + labels = list(x[0].keys()) + + assert labels == expected_result diff --git a/test/pytest/test_data/kitten.jpg b/test/pytest/test_data/kitten.jpg new file mode 100644 index 0000000000..ffcd2be2c6 Binary files /dev/null and b/test/pytest/test_data/kitten.jpg differ diff --git a/test/pytest/test_data/torch_compile/compile_handler.py b/test/pytest/test_data/torch_compile/compile_handler.py new file mode 100644 index 0000000000..745552f671 --- /dev/null +++ b/test/pytest/test_data/torch_compile/compile_handler.py @@ -0,0 +1,20 @@ +import torch + +from ts.torch_handler.base_handler import BaseHandler + + +class CompileHandler(BaseHandler): + def __init__(self): + super().__init__() + + def initialize(self, context): + super().initialize(context) + + def 
preprocess(self, data): + instances = data[0]["body"]["instances"] + input_tensor = torch.as_tensor(instances, dtype=torch.float32) + return input_tensor + + def postprocess(self, data): + # Convert the output tensor to a list and return only its third element + return data.tolist()[2]
diff --git a/test/pytest/test_data/torch_compile/config.properties b/test/pytest/test_data/torch_compile/config.properties new file mode 100644 index 0000000000..8b34c47528 --- /dev/null +++ b/test/pytest/test_data/torch_compile/config.properties @@ -0,0 +1,9 @@ +inference_address=http://0.0.0.0:8080 +management_address=http://0.0.0.0:8081 +metrics_address=http://0.0.0.0:8082 +model_store=/home/model-server/model-store +load_models=half_plus_two.mar +min_workers=1 +max_workers=1 +default_workers_per_model=1 +service_envelope=json
diff --git a/test/pytest/test_data/torch_compile/model.py b/test/pytest/test_data/torch_compile/model.py new file mode 100644 index 0000000000..b02c8fe106 --- /dev/null +++ b/test/pytest/test_data/torch_compile/model.py @@ -0,0 +1,13 @@ +import torch + + +class HalfPlusTwoModel(torch.nn.Module): + def forward(self, *input_args): + w = torch.tensor(0.5) + b = torch.tensor(2.0) + return torch.add(torch.multiply(w, input_args[0]), b) + + +if __name__ == "__main__": + model = HalfPlusTwoModel() + torch.save(model.state_dict(), "model.pt")
diff --git a/test/pytest/test_data/torch_compile/pt2.yaml b/test/pytest/test_data/torch_compile/pt2.yaml new file mode 100644 index 0000000000..d621c5aac8 --- /dev/null +++ b/test/pytest/test_data/torch_compile/pt2.yaml @@ -0,0 +1 @@ +pt2 : "inductor"
diff --git a/test/pytest/test_data/torch_compile/xla.yaml b/test/pytest/test_data/torch_compile/xla.yaml new file mode 100644 index 0000000000..83ebeba7b7 --- /dev/null +++ b/test/pytest/test_data/torch_compile/xla.yaml @@ -0,0 +1 @@ +pt2 : "torchxla_trace_once"
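The pt2.yaml and xla.yaml fixtures above are TorchServe model-config files that name a torch.compile backend for the half-plus-two test model. As a rough standalone sketch of what that setting amounts to (assuming the configured backend name is handed through to torch.compile, which the torch==2.0.1 pins above make available), the snippet below compiles the same model with the "inductor" backend from pt2.yaml; it is illustrative only and not part of this diff:

import torch

# Same toy model as test_data/torch_compile/model.py above.
class HalfPlusTwoModel(torch.nn.Module):
    def forward(self, *input_args):
        w = torch.tensor(0.5)
        b = torch.tensor(2.0)
        return torch.add(torch.multiply(w, input_args[0]), b)

model = HalfPlusTwoModel().eval()
compiled = torch.compile(model, backend="inductor")  # backend name from pt2.yaml
x = torch.tensor([1.0, 2.0, 5.0])
print(compiled(x))  # tensor([2.5000, 3.0000, 4.5000]), i.e. 0.5 * x + 2.0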
diff --git a/test/pytest/test_distributed_inference_handler.py b/test/pytest/test_distributed_inference_handler.py new file mode 100644 index 0000000000..b5ab178e5d --- /dev/null +++ b/test/pytest/test_distributed_inference_handler.py @@ -0,0 +1,47 @@ +import os +import sys + +import pytest +import test_utils + +REPO_ROOT = os.path.normpath( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../") +) +sys.path.append(REPO_ROOT) + +TEST_DIR = os.path.join(REPO_ROOT, "test") +MODEL_STORE_DIR = os.path.join("model_store") +snapshot_file = os.path.join(TEST_DIR, "config_ts.properties") + +POSTMAN_LARGE_MODEL_INFERENCE_DATA_FILE = os.path.join( + "postman", "large_model_inference_data.json" +) +TS_CONSOLE_LOG_FILE = os.path.join("ts_log.log") +POSTMAN_ENV_FILE = os.path.join("postman", "environment.json") +POSTMAN_COLLECTION_INFERENCE = os.path.join( + "postman", "inference_api_test_collection.json" +) +ARTIFACTS_INFERENCE_DIR = os.path.join("artifacts", "inference") +REPORT_FILE = os.path.join("report.html") + + +@pytest.mark.skip( + reason="Distributed inference requires multi-gpu machine, skipping for now" +) +def test_large_model_inference(): + """Run a Newman test for distributed inference on a large model""" + os.chdir(TEST_DIR) + + test_utils.start_torchserve( + model_store=MODEL_STORE_DIR, snapshot_file=snapshot_file, gen_mar=False + ) + + try: + command = f"newman run -e {POSTMAN_ENV_FILE} {POSTMAN_COLLECTION_INFERENCE} -d {POSTMAN_LARGE_MODEL_INFERENCE_DATA_FILE} -r cli,htmlextra --reporter-htmlextra-export {ARTIFACTS_INFERENCE_DIR}/{REPORT_FILE} --verbose" + result = os.system(command) + assert ( + result == 0 + ), "Error: Distributed inference failed, the exit code is not zero" + finally: + test_utils.stop_torchserve() + test_utils.cleanup_model_store(model_store=MODEL_STORE_DIR)
diff --git a/test/pytest/test_example_intel_extension_for_pytorch.py b/test/pytest/test_example_intel_extension_for_pytorch.py index 9b79eb3ae5..5ff882a800 100644 --- a/test/pytest/test_example_intel_extension_for_pytorch.py +++ b/test/pytest/test_example_intel_extension_for_pytorch.py @@ -5,6 +5,7 @@ import pytest import requests import test_utils +import torch from test_handler import run_inference_using_url_with_data REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../") @@ -15,11 +16,19 @@ MANAGEMENT_API = "http://localhost:8081" INFERENCE_API = "http://localhost:8080" -ipex_launcher_available = False -cmd = ["python", "-m", "intel_extension_for_pytorch.cpu.launch", "--no_python", "ls"] +xeon_run_cpu_available = False +cmd = ["python", "-m", "torch.backends.xeon.run_cpu", "--no_python", "ls"] r = subprocess.run(cmd) if r.returncode == 0: - ipex_launcher_available = True + xeon_run_cpu_available = True + +ipex_available = False +cmd = ["python", "-c", "import intel_extension_for_pytorch as ipex"] +r = subprocess.run(cmd) +if r.returncode == 0: + ipex_available = True + +ipex_xeon_run_cpu_available = xeon_run_cpu_available and ipex_available def setup_module(): @@ -39,10 +48,10 @@ def setup_torchserve(): def get_worker_affinity(num_workers, worker_idx): - from intel_extension_for_pytorch.cpu.launch import CPUinfo + from torch.backends.xeon.run_cpu import _CPUinfo - cpuinfo = CPUinfo() - num_cores = cpuinfo.physical_core_nums() + cpuinfo = _CPUinfo() + num_cores = cpuinfo._physical_core_nums() num_cores_per_worker = num_cores // num_workers start = worker_idx * num_cores_per_worker @@ -75,8 +84,9 @@ def scale_workers_with_core_pinning(scaled_num_workers): @pytest.mark.skipif( - not ipex_launcher_available, - reason="Make sure intel-extension-for-pytorch is installed", + not ipex_xeon_run_cpu_available + or ((torch.cuda.device_count() > 0) and torch.cuda.is_available()), + reason="Make sure intel-extension-for-pytorch is installed and torch.backends.xeon.run_cpu is available", ) def test_single_worker_affinity(): num_workers = 1 @@ -98,11 +108,12 @@ def test_single_worker_affinity(): @pytest.mark.skipif( - not ipex_launcher_available, - reason="Make sure intel-extension-for-pytorch is installed", + not ipex_xeon_run_cpu_available + or ((torch.cuda.device_count() > 0) and torch.cuda.is_available()), + reason="Make sure intel-extension-for-pytorch is installed and torch.backends.xeon.run_cpu is available", ) def test_multi_worker_affinity(): - num_workers = 4 + num_workers = 2 setup_torchserve() requests.post( "http://localhost:8081/models?initial_workers={}&synchronous=true&url=resnet-18.mar".format( num_workers ) ) @@ -123,11 +134,12 @@ @pytest.mark.skipif( - not ipex_launcher_available, - reason="Make sure intel-extension-for-pytorch is installed", + not ipex_xeon_run_cpu_available + or ((torch.cuda.device_count() > 0) and torch.cuda.is_available()), + reason="Make sure intel-extension-for-pytorch is installed and torch.backends.xeon.run_cpu is available", ) def test_worker_scale_up_affinity(): - initial_num_workers = 2 + initial_num_workers = 1 setup_torchserve() requests.post( "http://localhost:8081/models?initial_workers={}&synchronous=true&url=resnet-18.mar".format( initial_num_workers ) ) - scaled_up_num_workers = 4 + scaled_up_num_workers = 2 response =
scale_workers_with_core_pinning(scaled_up_num_workers) resnet18_list = json.loads(response.content) assert ( @@ -155,11 +167,12 @@ def test_worker_scale_up_affinity(): @pytest.mark.skipif( - not ipex_launcher_available, - reason="Make sure intel-extension-for-pytorch is installed", + not ipex_xeon_run_cpu_available + or ((torch.cuda.device_count() > 0) and torch.cuda.is_available()), + reason="Make sure intel-extension-for-pytorch is installed and torch.backends.xeon.run_cpu is available", ) def test_worker_scale_down_affinity(): - initial_num_workers = 4 + initial_num_workers = 2 setup_torchserve() requests.post( "http://localhost:8081/models?initial_workers={}&synchronous=true&url=resnet-18.mar".format( @@ -167,7 +180,7 @@ def test_worker_scale_down_affinity(): ) ) - scaled_down_num_workers = 2 + scaled_down_num_workers = 1 response = scale_workers_with_core_pinning(scaled_down_num_workers) resnet18_list = json.loads(response.content) assert ( diff --git a/test/pytest/test_example_micro_batching.py b/test/pytest/test_example_micro_batching.py new file mode 100644 index 0000000000..60a3c9c3d0 --- /dev/null +++ b/test/pytest/test_example_micro_batching.py @@ -0,0 +1,227 @@ +import asyncio +import json +import random +import shutil +from argparse import Namespace +from pathlib import Path +from unittest.mock import MagicMock, patch +from zipfile import ZIP_STORED, ZipFile + +import pytest +import requests +import test_utils +import yaml +from torchvision.models.resnet import ResNet18_Weights + +from ts.torch_handler.unit_tests.test_utils.model_dir import download_model + +CURR_FILE_PATH = Path(__file__).parent +REPO_ROOT_DIR = CURR_FILE_PATH.parent.parent + +EXAMPLE_ROOT_DIR = REPO_ROOT_DIR.joinpath("examples", "microbatching") + + +def read_image_bytes(filename): + with open( + filename, + "rb", + ) as fin: + image_bytes = fin.read() + return image_bytes + + +@pytest.fixture(scope="module") +def kitten_image_bytes(): + return read_image_bytes( + REPO_ROOT_DIR.joinpath( + "examples/image_classifier/resnet_152_batch/images/kitten.jpg" + ).as_posix() + ) + + +@pytest.fixture(scope="module") +def dog_image_bytes(): + return read_image_bytes( + REPO_ROOT_DIR.joinpath( + "examples/image_classifier/resnet_152_batch/images/dog.jpg" + ).as_posix() + ) + + +@pytest.fixture(scope="module", params=[4, 16]) +def mixed_batch(kitten_image_bytes, dog_image_bytes, request): + batch_size = request.param + labels = [ + "tiger_cat" if random.random() > 0.5 else "golden_retriever" + for _ in range(batch_size) + ] + test_data = [] + for l in labels: + test_data.append(kitten_image_bytes if l == "tiger_cat" else dog_image_bytes) + return test_data, labels + + +@pytest.fixture(scope="module") +def model_name(): + yield "image_classifier" + + +@pytest.fixture(scope="module") +def work_dir(tmp_path_factory, model_name): + return tmp_path_factory.mktemp(model_name) + + +@pytest.fixture(scope="module") +def serialized_file(work_dir): + model_url = ResNet18_Weights.DEFAULT.url + + download_model(model_url, work_dir) + + yield Path(work_dir) / "model.pt" + + +@pytest.fixture( + scope="module", name="mar_file_path", params=["yaml_config", "no_config"] +) +def create_mar_file(work_dir, serialized_file, model_archiver, model_name, request): + mar_file_path = Path(work_dir).joinpath(model_name + ".mar") + + name_file = REPO_ROOT_DIR.joinpath( + "examples/image_classifier/resnet_18/index_to_name.json" + ).as_posix() + + config_file = None + if request.param == "yaml_config": + micro_batching_params = { + "micro_batching": { 
+ "micro_batch_size": 2, + "parallelism": { + "preprocess": 2, + "inference": 2, + "postprocess": 2, + }, + }, + } + + config_file = Path(work_dir).joinpath("model_config.yaml") + + with open(config_file, "w") as f: + yaml.dump(micro_batching_params, f) + config_file = REPO_ROOT_DIR.joinpath( + "examples", "micro_batching", "config.yaml" + ) + + extra_files = [name_file] + + args = Namespace( + model_name=model_name, + version="1.0", + serialized_file=str(serialized_file), + model_file=REPO_ROOT_DIR.joinpath( + "examples", "image_classifier", "resnet_18", "model.py" + ).as_posix(), + handler=REPO_ROOT_DIR.joinpath( + "examples", "micro_batching", "micro_batching_handler.py" + ).as_posix(), + extra_files=",".join(extra_files), + export_path=work_dir, + requirements_file=None, + runtime="python", + force=False, + archive_format="default", + config_file=config_file, + ) + + mock = MagicMock() + mock.parse_args = MagicMock(return_value=args) + with patch("archiver.ArgParser.export_model_args_parser", return_value=mock): + # Using ZIP_STORED instead of ZIP_DEFLATED reduces test runtime from 54 secs to 10 secs + with patch( + "model_archiver.model_packaging_utils.zipfile.ZipFile", + lambda x, y, _: ZipFile(x, y, ZIP_STORED), + ): + model_archiver.generate_model_archive() + + assert mar_file_path.exists() + + yield mar_file_path.as_posix() + + # Clean up files + mar_file_path.unlink(missing_ok=True) + + +@pytest.fixture(scope="module", name="model_name") +def register_model(mar_file_path, model_store, torchserve): + """ + Register the model in torchserve + """ + shutil.copy(mar_file_path, model_store) + + file_name = Path(mar_file_path).name + + model_name = Path(file_name).stem + + params = ( + ("model_name", model_name), + ("url", file_name), + ("initial_workers", "1"), + ("synchronous", "true"), + ("batch_size", "32"), + ) + + test_utils.reg_resp = test_utils.register_model_with_params(params) + + yield model_name + + test_utils.unregister_model(model_name) + + +def test_single_example_inference(model_name, kitten_image_bytes): + """ + Full circle test with torchserve + """ + + response = requests.post( + url=f"http://localhost:8080/predictions/{model_name}", data=kitten_image_bytes + ) + + import inspect + + print(inspect.getmembers(response)) + + assert response.status_code == 200 + + +async def issue_request(model_name, data): + return requests.post( + url=f"http://localhost:8080/predictions/{model_name}", data=data + ) + + +async def issue_multi_requests(model_name, data): + tasks = [] + for d in data: + tasks.append(asyncio.create_task(issue_request(model_name, d))) + + ret = [] + for t in tasks: + ret.append(await t) + + return ret + + +def test_multi_example_inference(model_name, mixed_batch): + """ + Full circle test with torchserve + """ + test_data, labels = mixed_batch + + responses = asyncio.run(issue_multi_requests(model_name, test_data)) + + status_codes = [r.status_code for r in responses] + + assert status_codes == [200] * len(status_codes) + + result_entries = [json.loads(r.text) for r in responses] + + assert all(l in r.keys() for l, r in zip(labels, result_entries)) diff --git a/test/pytest/test_example_near_real_time_video.py b/test/pytest/test_example_near_real_time_video.py index 7b48c3147b..18548a7e0b 100644 --- a/test/pytest/test_example_near_real_time_video.py +++ b/test/pytest/test_example_near_real_time_video.py @@ -64,6 +64,7 @@ def create_mar_file(work_dir, session_mocker, model_archiver): runtime="python", force=False, archive_format="default", + config_file=None, 
) mock = session_mocker.MagicMock() diff --git a/test/pytest/test_example_scriptable_tokenzier.py b/test/pytest/test_example_scriptable_tokenzier.py index c16b4cf364..ca1909edc6 100644 --- a/test/pytest/test_example_scriptable_tokenzier.py +++ b/test/pytest/test_example_scriptable_tokenzier.py @@ -160,6 +160,7 @@ def create_mar_file(work_dir, session_mocker, jit_file_path, model_archiver): runtime="python", force=False, archive_format="default", + config_file=None, ) mock = session_mocker.MagicMock() @@ -215,7 +216,6 @@ def test_handler(monkeypatch, mocker, jit_file_path, test_file): # We need to recreate the handler to avoid running into https://github.com/pytorch/text/issues/1849 def create_and_call_handler(input_text): - from handler import CustomTextClassifier handler = CustomTextClassifier() @@ -250,7 +250,6 @@ def create_and_call_handler(input_text): def test_inference_with_untrained_model_and_sample_text(model_name, test_file): - with open(test_file, "rb") as f: response = requests.post( url=f"http://localhost:8080/predictions/{model_name}", data=f @@ -269,7 +268,6 @@ def test_inference_with_untrained_model_and_sample_text(model_name, test_file): def test_inference_with_untrained_model_and_empty_string(model_name): - data = "".encode("utf8") response = requests.post( diff --git a/test/pytest/test_example_torchrec_dlrm.py b/test/pytest/test_example_torchrec_dlrm.py index c0808fc5d7..e4fef7e240 100644 --- a/test/pytest/test_example_torchrec_dlrm.py +++ b/test/pytest/test_example_torchrec_dlrm.py @@ -104,6 +104,7 @@ def create_mar_file(work_dir, session_mocker, serialized_file, model_archiver): runtime="python", force=False, archive_format="default", + config_file=None, ) mock = session_mocker.MagicMock() diff --git a/test/pytest/test_gRPC_inference_api.py b/test/pytest/test_gRPC_inference_api.py index 01d60fe814..3ae6bc0a55 100644 --- a/test/pytest/test_gRPC_inference_api.py +++ b/test/pytest/test_gRPC_inference_api.py @@ -1,13 +1,14 @@ +import json +import os from ast import literal_eval + import inference_pb2 -import json import management_pb2 -import os import test_gRPC_utils import test_utils - inference_data_json = "../postman/inference_data.json" +inference_stream_data_json = "../postman/inference_stream_data.json" def setup_module(module): @@ -25,66 +26,138 @@ def __get_change(current, previous): try: return (abs(current - previous) / previous) * 100.0 except ZeroDivisionError: - return float('inf') + return float("inf") def __infer(stub, model_name, model_input): - with open(model_input, 'rb') as f: + with open(model_input, "rb") as f: data = f.read() - input_data = {'data': data} + input_data = {"data": data} response = stub.Predictions( - inference_pb2.PredictionsRequest(model_name=model_name, input=input_data)) + inference_pb2.PredictionsRequest(model_name=model_name, input=input_data) + ) - prediction = response.prediction.decode('utf-8') + prediction = response.prediction.decode("utf-8") return prediction def test_inference_apis(): - with open(os.path.join(os.path.dirname(__file__), inference_data_json), 'rb') as f: + with open(os.path.join(os.path.dirname(__file__), inference_data_json), "rb") as f: test_data = json.loads(f.read()) for item in test_data: - if item['url'].startswith('{{mar_path_'): - path = test_utils.mar_file_table[item['url'][2:-2]] + if item["url"].startswith("{{mar_path_"): + path = test_utils.mar_file_table[item["url"][2:-2]] else: - path = item['url'] + path = item["url"] managment_stub = test_gRPC_utils.get_management_stub() - response = 
managment_stub.RegisterModel(management_pb2.RegisterModelRequest( - url=path, - initial_workers=item['worker'], - synchronous=bool(item['synchronous']), - model_name=item['model_name'] - )) + response = managment_stub.RegisterModel( + management_pb2.RegisterModelRequest( + url=path, + initial_workers=item["worker"], + synchronous=bool(item["synchronous"]), + model_name=item["model_name"], + ) + ) print(response.msg) - model_input = os.path.join(os.path.dirname(__file__), "..", item['file']) - prediction = __infer(test_gRPC_utils.get_inference_stub(), item['model_name'], model_input) + model_input = os.path.join(os.path.dirname(__file__), "..", item["file"]) + prediction = __infer( + test_gRPC_utils.get_inference_stub(), item["model_name"], model_input + ) print("Prediction is : ", str(prediction)) - if 'expected' in item: + if "expected" in item: try: prediction = literal_eval(prediction) except SyntaxError: pass - if isinstance(prediction, list) and 'tolerance' in item: - assert len(prediction) == len(item['expected']) + if isinstance(prediction, list) and "tolerance" in item: + assert len(prediction) == len(item["expected"]) for i in range(len(prediction)): - assert __get_change(prediction[i], item['expected'][i]) < item['tolerance'] - elif isinstance(prediction, dict) and 'tolerance' in item: - assert len(prediction) == len(item['expected']) + assert ( + __get_change(prediction[i], item["expected"][i]) + < item["tolerance"] + ) + elif isinstance(prediction, dict) and "tolerance" in item: + assert len(prediction) == len(item["expected"]) for key in prediction: - assert __get_change(prediction[key], item['expected'][key]) < item['tolerance'] + assert ( + __get_change(prediction[key], item["expected"][key]) + < item["tolerance"] + ) else: - assert str(prediction) == str(item['expected']) + assert str(prediction) == str(item["expected"]) + + response = managment_stub.UnregisterModel( + management_pb2.UnregisterModelRequest( + model_name=item["model_name"], + ) + ) + + print(response.msg) + + +def __infer_stream(stub, model_name, model_input): + with open(model_input, "rb") as f: + data = f.read() + + input_data = {"data": data} + responses = stub.StreamPredictions( + inference_pb2.PredictionsRequest(model_name=model_name, input=input_data) + ) + + prediction = [] + for resp in responses: + prediction.append(resp.prediction.decode("utf-8")) + + return " ".join(prediction) + + +def test_inference_stream_apis(): + with open( + os.path.join(os.path.dirname(__file__), inference_stream_data_json), "rb" + ) as f: + test_data = json.loads(f.read()) + + for item in test_data: + if item["url"].startswith("{{mar_path_"): + path = test_utils.mar_file_table[item["url"][2:-2]] + else: + path = item["url"] + + managment_stub = test_gRPC_utils.get_management_stub() + response = managment_stub.RegisterModel( + management_pb2.RegisterModelRequest( + url=path, + initial_workers=item["worker"], + synchronous=bool(item["synchronous"]), + model_name=item["model_name"], + ) + ) + + print(response.msg) + + model_input = os.path.join(os.path.dirname(__file__), "..", item["file"]) + prediction = __infer_stream( + test_gRPC_utils.get_inference_stub(), item["model_name"], model_input + ) + + print("Stream prediction is : ", str(prediction)) + + if "expected" in item: + assert str(prediction) == str(item["expected"]) - response = managment_stub.UnregisterModel(management_pb2.UnregisterModelRequest( - model_name=item['model_name'], - )) + response = managment_stub.UnregisterModel( + 
management_pb2.UnregisterModelRequest( + model_name=item["model_name"], + ) + ) print(response.msg) diff --git a/test/pytest/test_handler.py b/test/pytest/test_handler.py index 9371e62814..5f46ba2275 100644 --- a/test/pytest/test_handler.py +++ b/test/pytest/test_handler.py @@ -1,46 +1,60 @@ -import os -import requests +import ast import json import logging -import test_utils +import os + import numpy as np -import ast import pytest +import requests +import test_utils import torch -REPO_ROOT = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../")) + +REPO_ROOT = os.path.normpath( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../") +) snapshot_file_kf = os.path.join(REPO_ROOT, "test", "config_kf.properties") -snapshot_file_tf = os.path.join(REPO_ROOT,"test", "config_ts.properties") -data_file_mnist = os.path.join(REPO_ROOT, "examples", "image_classifier", "mnist", "test_data", "1.png") -input_json_mnist = os.path.join(REPO_ROOT, "kubernetes", "kserve", "kf_request_json", "v1", "mnist.json") -input_json_mmf = os.path.join(REPO_ROOT, "examples", "MMF-activity-recognition", "372CC.info.json") +snapshot_file_tf = os.path.join(REPO_ROOT, "test", "config_ts.properties") +data_file_mnist = os.path.join( + REPO_ROOT, "examples", "image_classifier", "mnist", "test_data", "1.png" +) +input_json_mnist = os.path.join( + REPO_ROOT, "kubernetes", "kserve", "kf_request_json", "v1", "mnist.json" +) +input_json_mmf = os.path.join( + REPO_ROOT, "examples", "MMF-activity-recognition", "372CC.info.json" +) logger = logging.getLogger(__name__) def getAPIS(snapshot_file): MANAGEMENT_API = "http://127.0.0.1:8081" INFERENCE_API = "http://127.0.0.1:8080" - + with open(snapshot_file, "r") as fp: lines = fp.readlines() for line in lines: - line = line.rstrip('\n') + line = line.rstrip("\n") if "management_address" in line: MANAGEMENT_API = line.split("=")[1] if "inference_address" in line: INFERENCE_API = line.split("=")[1] - + return (MANAGEMENT_API, INFERENCE_API) KF_MANAGEMENT_API, KF_INFERENCE_API = getAPIS(snapshot_file_kf) TF_MANAGEMENT_API, TF_INFERENCE_API = getAPIS(snapshot_file_tf) + def setup_module(module): test_utils.torchserve_cleanup() - response = requests.get("https://torchserve.pytorch.org/mar_files/mnist.mar", allow_redirects=True) - with open(os.path.join(test_utils.MODEL_STORE, "mnist.mar"), 'wb') as f: + response = requests.get( + "https://torchserve.pytorch.org/mar_files/mnist.mar", allow_redirects=True + ) + with open(os.path.join(test_utils.MODEL_STORE, "mnist.mar"), "wb") as f: f.write(response.content) + def teardown_module(module): test_utils.torchserve_cleanup() @@ -50,27 +64,33 @@ def mnist_model_register_using_non_existent_handler_then_scale_up(synchronous=Fa Validates that snapshot.cfg is created when management apis are invoked. 
""" response = requests.post( - TF_MANAGEMENT_API + '/models?handler=nehandler&url=mnist.mar') + TF_MANAGEMENT_API + "/models?handler=nehandler&url=mnist.mar" + ) # Scale up workers if synchronous: - params = (('min_worker', '2'), ('synchronous', 'True'),) + params = ( + ("min_worker", "2"), + ("synchronous", "True"), + ) else: - params = (('min_worker', '2'),) + params = (("min_worker", "2"),) - response = requests.put(TF_MANAGEMENT_API + '/models/mnist', params=params) + response = requests.put(TF_MANAGEMENT_API + "/models/mnist", params=params) # Check if workers got scaled - response = requests.get(TF_MANAGEMENT_API + '/models/mnist') + response = requests.get(TF_MANAGEMENT_API + "/models/mnist") return response def mnist_model_register_and_scale_using_non_existent_handler_synchronous(): # Register & Scale model - response = mnist_model_register_using_non_existent_handler_then_scale_up(synchronous=True) + response = mnist_model_register_using_non_existent_handler_then_scale_up( + synchronous=True + ) mnist_list = json.loads(response.content) try: # Workers should not scale up - assert len(mnist_list[0]['workers']) == 0 + assert len(mnist_list[0]["workers"]) == 0 finally: # UnRegister mnist model test_utils.unregister_model("mnist") @@ -82,22 +102,29 @@ def mnist_model_register_and_scale_using_non_existent_handler_asynchronous(): mnist_list = json.loads(response.content) try: # Workers should not scale up - assert len(mnist_list[0]['workers']) == 0 + assert len(mnist_list[0]["workers"]) == 0 finally: # UnRegister mnist model test_utils.unregister_model("mnist") def run_inference_using_url_with_data(purl=None, pfiles=None, ptimeout=120): + print(f"purl={purl}") + print(f"pfiles={pfiles}") if purl is None and pfiles is None: return + print(f"purl1={purl}") + print(f"pfiles1={pfiles}") try: response = requests.post(url=purl, files=pfiles, timeout=ptimeout) except: + print(f"sent echo_stream rep=none") return None else: + print(f"sent echo_stream rep={response}") return response + def run_inference_using_url_with_data_json(purl=None, json_input=None, ptimeout=120): if purl is None and pfiles is None: return @@ -114,12 +141,13 @@ def test_mnist_model_register_and_inference_on_valid_model(): Validates that snapshot.cfg is created when management apis are invoked. 
""" test_utils.start_torchserve(no_config_snapshots=True) - test_utils.register_model('mnist', 'mnist.mar') + test_utils.register_model("mnist", "mnist.mar") files = { - 'data': (data_file_mnist, - open(data_file_mnist, 'rb')), + "data": (data_file_mnist, open(data_file_mnist, "rb")), } - response = run_inference_using_url_with_data(TF_INFERENCE_API + '/predictions/mnist', files) + response = run_inference_using_url_with_data( + TF_INFERENCE_API + "/predictions/mnist", files + ) assert (json.loads(response.content)) == 1 test_utils.unregister_model("mnist") @@ -132,14 +160,22 @@ def test_mnist_model_register_using_non_existent_handler_with_nonzero_workers(): """ response = requests.post( - TF_MANAGEMENT_API + '/models?handler=nehandlermodels&initial_workers=1&url=mnist.mar') - if json.loads(response.content)['code'] == 500 and \ - json.loads(response.content)['type'] == "InternalServerException": - assert True, "Internal Server Exception, " \ - "Cannot register model with non existent handler with non zero workers" + TF_MANAGEMENT_API + + "/models?handler=nehandlermodels&initial_workers=1&url=mnist.mar" + ) + if ( + json.loads(response.content)["code"] == 500 + and json.loads(response.content)["type"] == "InternalServerException" + ): + assert True, ( + "Internal Server Exception, " + "Cannot register model with non existent handler with non zero workers" + ) else: - assert False, "Something is not right!! Successfully registered model with " \ - "non existent handler with non zero workers" + assert False, ( + "Something is not right!! Successfully registered model with " + "non existent handler with non zero workers" + ) test_utils.unregister_model("mnist") @@ -147,19 +183,22 @@ def test_mnist_model_register_using_non_existent_handler_with_nonzero_workers(): def test_mnist_model_register_scale_inference_with_non_existent_handler(): response = mnist_model_register_using_non_existent_handler_then_scale_up() mnist_list = json.loads(response.content) - assert len(mnist_list[0]['workers']) > 1 + assert len(mnist_list[0]["workers"]) > 1 files = { - 'data': (data_file_mnist, - open(data_file_mnist, 'rb')), + "data": (data_file_mnist, open(data_file_mnist, "rb")), } - response = run_inference_using_url_with_data(TF_INFERENCE_API + '/predictions/mnist', files) + response = run_inference_using_url_with_data( + TF_INFERENCE_API + "/predictions/mnist", files + ) if response is None: assert True, "Inference failed as the handler is non existent" else: if json.loads(response.content) == 1: - assert False, "Something is not right!! Somehow Inference passed " \ - "despite passing non existent handler" + assert False, ( + "Something is not right!! Somehow Inference passed " + "despite passing non existent handler" + ) def test_mnist_model_register_and_inference_on_valid_model_explain(): @@ -167,12 +206,13 @@ def test_mnist_model_register_and_inference_on_valid_model_explain(): Validates that snapshot.cfg is created when management apis are invoked. 
""" test_utils.start_torchserve(no_config_snapshots=True) - test_utils.register_model('mnist', 'mnist.mar') + test_utils.register_model("mnist", "mnist.mar") files = { - 'data': (data_file_mnist, - open(data_file_mnist, 'rb')), + "data": (data_file_mnist, open(data_file_mnist, "rb")), } - response = run_inference_using_url_with_data(TF_INFERENCE_API + '/explanations/mnist', files) + response = run_inference_using_url_with_data( + TF_INFERENCE_API + "/explanations/mnist", files + ) assert np.array(json.loads(response.content)).shape == (1, 28, 28) test_utils.unregister_model("mnist") @@ -182,124 +222,194 @@ def test_kserve_mnist_model_register_and_inference_on_valid_model(): """ Validates that snapshot.cfg is created when management apis are invoked for kserve. """ - test_utils.start_torchserve(snapshot_file = snapshot_file_kf) - test_utils.register_model('mnist', 'mnist.mar') + test_utils.start_torchserve(snapshot_file=snapshot_file_kf) + test_utils.register_model("mnist", "mnist.mar") - with open(input_json_mnist, 'r') as f: + with open(input_json_mnist, "r") as f: s = f.read() - s = s.replace('\'','\"') + s = s.replace("'", '"') data = json.loads(s) - response = run_inference_using_url_with_data_json(KF_INFERENCE_API + '/v1/models/mnist:predict', data) + response = run_inference_using_url_with_data_json( + KF_INFERENCE_API + "/v1/models/mnist:predict", data + ) - assert (json.loads(response.content)['predictions'][0]) == 2 + assert (json.loads(response.content)["predictions"][0]) == 2 test_utils.unregister_model("mnist") -def test_kserve_mnist_model_register_scale_inference_with_non_existent_handler( -): +def test_kserve_mnist_model_register_scale_inference_with_non_existent_handler(): response = mnist_model_register_using_non_existent_handler_then_scale_up() mnist_list = json.loads(response.content) - assert len(mnist_list[0]['workers']) > 1 - with open(input_json_mnist, 'r') as f: + assert len(mnist_list[0]["workers"]) > 1 + with open(input_json_mnist, "r") as f: s = f.read() - s = s.replace('\'','\"') + s = s.replace("'", '"') data = json.loads(s) - response = run_inference_using_url_with_data_json(KF_INFERENCE_API + '/v1/models/mnist:predict', data) + response = run_inference_using_url_with_data_json( + KF_INFERENCE_API + "/v1/models/mnist:predict", data + ) if response is None: assert True, "Inference failed as the handler is non existent" else: if json.loads(response.content) == 1: - assert False, "Something is not right!! Somehow Inference passed " \ - "despite passing non existent handler" + assert False, ( + "Something is not right!! Somehow Inference passed " + "despite passing non existent handler" + ) def test_kserve_mnist_model_register_and_inference_on_valid_model_explain(): """ Validates the kserve model explanations. 
""" - test_utils.start_torchserve(snapshot_file = snapshot_file_kf) - test_utils.register_model('mnist', 'mnist.mar') - with open(input_json_mnist, 'r') as f: + test_utils.start_torchserve(snapshot_file=snapshot_file_kf) + test_utils.register_model("mnist", "mnist.mar") + with open(input_json_mnist, "r") as f: s = f.read() - s = s.replace('\'','\"') + s = s.replace("'", '"') data = json.loads(s) - response = run_inference_using_url_with_data_json(KF_INFERENCE_API + '/v1/models/mnist:explain', data) + response = run_inference_using_url_with_data_json( + KF_INFERENCE_API + "/v1/models/mnist:explain", data + ) - assert np.array(json.loads(response.content)['explanations']).shape == (1, 1, 28, 28) + assert np.array(json.loads(response.content)["explanations"]).shape == ( + 1, + 1, + 28, + 28, + ) test_utils.unregister_model("mnist") + def test_huggingface_bert_batch_inference(): batch_size = 2 - batch_delay = 10000 # 10 seconds + batch_delay = 10000 # 10 seconds params = ( - ('model_name', 'BERTSeqClassification'), - ('url', 'https://torchserve.pytorch.org/mar_files/BERTSeqClassification.mar'), - ('initial_workers', '1'), - ('batch_size', str(batch_size)), - ('max_batch_delay', str(batch_delay)) + ("model_name", "BERTSeqClassification"), + ("url", "https://torchserve.pytorch.org/mar_files/BERTSeqClassification.mar"), + ("initial_workers", "1"), + ("batch_size", str(batch_size)), + ("max_batch_delay", str(batch_delay)), ) test_utils.start_torchserve(no_config_snapshots=True) test_utils.register_model_with_params(params) - input_text = os.path.join(REPO_ROOT, 'examples', 'Huggingface_Transformers', 'Seq_classification_artifacts', 'sample_text.txt') + input_text = os.path.join( + REPO_ROOT, + "examples", + "Huggingface_Transformers", + "Seq_classification_artifacts", + "sample_text.txt", + ) # Make 2 curl requests in parallel with & # curl --header \"X-Forwarded-For: 1.2.3.4\" won't work since you can't access local host anymore - response = os.popen(f"curl http://127.0.0.1:8080/predictions/BERTSeqClassification -T {input_text} & curl http://127.0.0.1:8080/predictions/BERTSeqClassification -T {input_text}") + response = os.popen( + f"curl http://127.0.0.1:8080/predictions/BERTSeqClassification -T {input_text} & curl http://127.0.0.1:8080/predictions/BERTSeqClassification -T {input_text}" + ) response = response.read() - ## Assert that 2 responses are returned from the same batch - assert response == 'Not AcceptedNot Accepted' - test_utils.unregister_model('BERTSeqClassification') + assert response == "Not AcceptedNot Accepted" + test_utils.unregister_model("BERTSeqClassification") + @pytest.mark.skip(reason="MMF doesn't support PT 1.10 yet") def test_MMF_activity_recognition_model_register_and_inference_on_valid_model(): - - test_utils.start_torchserve(snapshot_file = snapshot_file_tf) - test_utils.register_model('MMF_activity_recognition_v2', 'https://torchserve.pytorch.org/mar_files/MMF_activity_recognition_v2.mar') - os.system('wget https://mmfartifacts.s3-us-west-2.amazonaws.com/372CC.mp4 -P ../../examples/MMF-activity-recognition') + test_utils.start_torchserve(snapshot_file=snapshot_file_tf) + test_utils.register_model( + "MMF_activity_recognition_v2", + "https://torchserve.pytorch.org/mar_files/MMF_activity_recognition_v2.mar", + ) + os.system( + "wget https://mmfartifacts.s3-us-west-2.amazonaws.com/372CC.mp4 -P ../../examples/MMF-activity-recognition" + ) input_json = "../../examples/MMF-activity-recognition/372CC.info.json" with open(input_json) as jsonfile: info = json.load(jsonfile) 
files = { - 'data': open('../../examples/MMF-activity-recognition/372CC.mp4','rb'), - 'script': info['script'], - 'labels':info['action_labels'] - } - response = run_inference_using_url_with_data(TF_INFERENCE_API + '/v1/models/MMF_activity_recognition_v2:predict', pfiles=files) + "data": open("../../examples/MMF-activity-recognition/372CC.mp4", "rb"), + "script": info["script"], + "labels": info["action_labels"], + } + response = run_inference_using_url_with_data( + TF_INFERENCE_API + "/v1/models/MMF_activity_recognition_v2:predict", + pfiles=files, + ) response = response.content.decode("utf-8") response = ast.literal_eval(response) response = [n.strip() for n in response] - assert response == ['Sitting at a table','Someone is sneezing','Watching a laptop or something on a laptop'] + assert response == [ + "Sitting at a table", + "Someone is sneezing", + "Watching a laptop or something on a laptop", + ] test_utils.unregister_model("MMF_activity_recognition_v2") + def test_huggingface_bert_model_parallel_inference(): number_of_gpus = torch.cuda.device_count() - check = os.popen(f"curl http://localhost:8081/models") - print(check) + check = os.popen("curl http://localhost:8081/models") + print(check.read()) if number_of_gpus > 1: batch_size = 1 - batch_delay = 5000 # 10 seconds + batch_delay = 5000  # 5 seconds params = ( - ('model_name', 'Textgeneration'), - ('url', 'https://bert-mar-file.s3.us-west-2.amazonaws.com/Textgeneration.mar'), - ('initial_workers', '1'), - ('batch_size', str(batch_size)), - ('max_batch_delay', str(batch_delay)) + ("model_name", "Textgeneration"), + ( + "url", + "https://bert-mar-file.s3.us-west-2.amazonaws.com/Textgeneration.mar", + ), + ("initial_workers", "1"), + ("batch_size", str(batch_size)), + ("max_batch_delay", str(batch_delay)), ) test_utils.start_torchserve(no_config_snapshots=True) test_utils.register_model_with_params(params) - input_text = os.path.join(REPO_ROOT, 'examples', 'Huggingface_Transformers', 'Text_gen_artifacts', 'sample_text_captum_input.txt') - - response = os.popen(f"curl http://127.0.0.1:8080/predictions/Textgeneration -T {input_text}") + input_text = os.path.join( + REPO_ROOT, + "examples", + "Huggingface_Transformers", + "Text_gen_artifacts", + "sample_text_captum_input.txt", + ) + + response = os.popen( + f"curl http://127.0.0.1:8080/predictions/Textgeneration -T {input_text}" + ) response = response.read() - - assert 'Bloomberg has decided to publish a new report on the global economy' in response - test_utils.unregister_model('Textgeneration') + + assert ( + "Bloomberg has decided to publish a new report on the global economy" + in response + ) + test_utils.unregister_model("Textgeneration") else: - logger.info("Running model parallel inference requuires more than one gpu, number of available gpus on thi machine is: ", number_of_gpus) + logger.info( + "Running model parallel inference requires more than one gpu; number of available gpus on this machine is: %s", + number_of_gpus, + ) + + +def test_echo_stream_inference(): + test_utils.start_torchserve(no_config_snapshots=True, gen_mar=False) + test_utils.register_model( + "echo_stream", "https://torchserve.pytorch.org/mar_files/echo_stream.mar" + ) + + response = requests.post( + TF_INFERENCE_API + "/predictions/echo_stream", data="foo", stream=True + ) + assert response.headers["Transfer-Encoding"] == "chunked" + + prediction = [] + for chunk in response.iter_content(chunk_size=None): + if chunk: + prediction.append(chunk.decode("utf-8")) + + assert " ".join(prediction) == "hello hello hello hello world " + test_utils.unregister_model("echo_stream") diff
--git a/test/pytest/test_metrics.py b/test/pytest/test_metrics.py index 126caaad82..27547cf471 100644 --- a/test/pytest/test_metrics.py +++ b/test/pytest/test_metrics.py @@ -1,6 +1,7 @@ import glob import os import platform +import re import shutil import time from os import path @@ -9,6 +10,30 @@ import test_utils NUM_STARTUP_CFG = 0 +FRONTEND_METRICS = [ + "Requests2XX", + "Requests4XX", + "Requests5XX", + "ts_inference_requests_total", + "ts_inference_latency_microseconds", + "ts_queue_latency_microseconds", + "QueueTime", + "WorkerThreadTime", + "WorkerLoadTime", +] +SYSTEM_METRICS = [ + "CPUUtilization", + "MemoryUsed", + "MemoryAvailable", + "MemoryUtilization", + "DiskUsage", + "DiskUtilization", + "DiskAvailable", + "GPUMemoryUtilization", + "GPUMemoryUsed", + "GPUUtilization", +] +BACKEND_METRICS = ["HandlerTime", "PredictionTime"] def setup_module(module): @@ -55,6 +80,34 @@ def run_log_location_var(custom_path=test_utils.ROOT_DIR, no_config_snapshots=Fa assert len(glob.glob(custom_path + "/ts_log.log")) == 1 +def register_densenet161_model_and_make_inference_request(): + test_utils.register_model("densenet161", "densenet161.mar") + data_file = os.path.join( + test_utils.REPO_ROOT, "examples/image_classifier/kitten.jpg" + ) + with open(data_file, "rb") as input_data: + requests.post( + url=f"http://localhost:8080/predictions/densenet161", data=input_data + ) + + +def validate_metrics_log(log_filename, metric_names, present): + assert len(glob.glob("logs/" + log_filename)) == 1 + metrics_path = glob.glob("logs/" + log_filename)[0] + if present: + assert os.path.getsize(metrics_path) > 0 + + metrics_regex = re.compile("|".join(metric_names), flags=re.IGNORECASE) + with open(metrics_path, "rt") as metrics_file: + metrics_data = metrics_file.read() + matched_metrics = re.findall(metrics_regex, metrics_data) + + if present: + assert len(matched_metrics) > 0 + else: + assert len(matched_metrics) == 0 + + def test_logs_created(): logs_created() global NUM_STARTUP_CFG @@ -318,3 +371,184 @@ def test_metrics_location_var_snapshot_enabled_rdonly_dir(): assert len(glob.glob(RDONLY_DIR + "/logs/ts_metrics.log")) == 0 finally: del os.environ["METRICS_LOCATION"] + + +def test_metrics_log_mode(): + """ + Validates that metrics uses log mode by default + """ + # Torchserve cleanup + test_utils.stop_torchserve() + test_utils.delete_all_snapshots() + # Remove existing logs if any + for f in glob.glob("logs/*.log"): + os.remove(f) + + try: + test_utils.start_torchserve( + model_store=test_utils.MODEL_STORE, + no_config_snapshots=True, + gen_mar=False, + ) + register_densenet161_model_and_make_inference_request() + validate_metrics_log("ts_metrics.log", FRONTEND_METRICS, True) + validate_metrics_log("ts_metrics.log", SYSTEM_METRICS, True) + validate_metrics_log("model_metrics.log", BACKEND_METRICS, True) + finally: + test_utils.stop_torchserve() + test_utils.delete_all_snapshots() + + +def test_metrics_prometheus_mode(): + """ + Validates metrics prometheus mode + """ + # Torchserve cleanup + test_utils.stop_torchserve() + test_utils.delete_all_snapshots() + # Remove existing logs if any + for f in glob.glob("logs/*.log"): + os.remove(f) + + config_file = test_utils.ROOT_DIR + "config.properties" + with open(config_file, "w") as f: + f.write("enable_envvars_config=true") + + os.environ["TS_METRICS_MODE"] = "prometheus" + + try: + test_utils.start_torchserve( + model_store=test_utils.MODEL_STORE, + snapshot_file=config_file, + no_config_snapshots=True, + gen_mar=False, + ) + 
register_densenet161_model_and_make_inference_request() + validate_metrics_log("ts_metrics.log", FRONTEND_METRICS, False) + validate_metrics_log("ts_metrics.log", SYSTEM_METRICS, False) + validate_metrics_log("model_metrics.log", BACKEND_METRICS, False) + + response = requests.get("http://localhost:8082/metrics") + prometheus_metrics = response.text + for metric_name in FRONTEND_METRICS: + assert metric_name in prometheus_metrics + for metric_name in SYSTEM_METRICS: + assert metric_name in prometheus_metrics + for metric_name in BACKEND_METRICS: + assert metric_name in prometheus_metrics + + prometheus_metric_patterns = [ + r'Requests2XX\{Level="Host",Hostname=".+",\} \d+\.\d+', + r'ts_inference_requests_total\{model_name="densenet161",model_version="default",hostname=".+",\} \d+\.\d+', + r'ts_inference_latency_microseconds\{model_name="densenet161",model_version="default",hostname=".+",\} \d+\.\d+', + r'ts_queue_latency_microseconds\{model_name="densenet161",model_version="default",hostname=".+",\} \d+\.\d+', + r'QueueTime\{Level="Host",Hostname=".+",\} \d+\.\d+', + r'WorkerThreadTime\{Level="Host",Hostname=".+",\} \d+\.\d+', + r'WorkerLoadTime\{WorkerName=".+",Level="Host",Hostname=".+",\} \d+\.\d+', + r'CPUUtilization\{Level="Host",Hostname=".+",\} \d+\.\d+', + r'MemoryUsed\{Level="Host",Hostname=".+",\} \d+\.\d+', + r'MemoryAvailable\{Level="Host",Hostname=".+",\} \d+\.\d+', + r'MemoryUtilization\{Level="Host",Hostname=".+",\} \d+\.\d+', + r'DiskUsage\{Level="Host",Hostname=".+",\} \d+\.\d+', + r'DiskUtilization\{Level="Host",Hostname=".+",\} \d+\.\d+', + r'DiskAvailable\{Level="Host",Hostname=".+",\} \d+\.\d+', + r'HandlerTime\{ModelName="densenet161",Level="Model",Hostname=".+",\} \d+\.\d+', + r'PredictionTime\{ModelName="densenet161",Level="Model",Hostname=".+",\} \d+\.\d+', + ] + + for pattern in prometheus_metric_patterns: + matches = re.findall(pattern, prometheus_metrics) + assert len(matches) == 1 + + finally: + test_utils.stop_torchserve() + test_utils.delete_all_snapshots() + del os.environ["TS_METRICS_MODE"] + os.remove(config_file) + + +def test_collect_system_metrics_when_not_disabled(): + """ + Validates that system metrics are collected when not disabled + """ + # Torchserve cleanup + test_utils.stop_torchserve() + test_utils.delete_all_snapshots() + # Remove existing logs if any + for f in glob.glob("logs/*.log"): + os.remove(f) + + try: + test_utils.start_torchserve( + model_store=test_utils.MODEL_STORE, no_config_snapshots=True, gen_mar=False + ) + register_densenet161_model_and_make_inference_request() + validate_metrics_log("ts_metrics.log", SYSTEM_METRICS, True) + finally: + test_utils.stop_torchserve() + test_utils.delete_all_snapshots() + + +def test_disable_system_metrics_using_config_properties(): + """ + Validates that system metrics collection is disabled when "disable_system_metrics" + configuration option is set to "true" + """ + # Torchserve cleanup + test_utils.stop_torchserve() + test_utils.delete_all_snapshots() + # Remove existing logs if any + for f in glob.glob("logs/*.log"): + os.remove(f) + + config_file = test_utils.ROOT_DIR + "config.properties" + with open(config_file, "w") as f: + f.write("disable_system_metrics=true") + + try: + test_utils.start_torchserve( + model_store=test_utils.MODEL_STORE, + snapshot_file=config_file, + no_config_snapshots=True, + gen_mar=False, + ) + register_densenet161_model_and_make_inference_request() + validate_metrics_log("ts_metrics.log", SYSTEM_METRICS, False) + finally: + test_utils.stop_torchserve() + 
test_utils.delete_all_snapshots() + os.remove(config_file) + + +def test_disable_system_metrics_using_environment_variable(): + """ + Validates that system metrics collection is disabled when TS_DISABLE_SYSTEM_METRICS + environment variable is set to "true" + """ + # Torchserve cleanup + test_utils.stop_torchserve() + test_utils.delete_all_snapshots() + # Remove existing logs if any + for f in glob.glob("logs/*.log"): + os.remove(f) + + config_file = test_utils.ROOT_DIR + "config.properties" + with open(config_file, "w") as f: + f.write("enable_envvars_config=true") + + os.environ["TS_DISABLE_SYSTEM_METRICS"] = "true" + + try: + test_utils.start_torchserve( + model_store=test_utils.MODEL_STORE, + snapshot_file=config_file, + no_config_snapshots=True, + gen_mar=False, + ) + register_densenet161_model_and_make_inference_request() + validate_metrics_log("ts_metrics.log", SYSTEM_METRICS, False) + finally: + test_utils.stop_torchserve() + test_utils.delete_all_snapshots() + del os.environ["TS_DISABLE_SYSTEM_METRICS"] + os.remove(config_file) diff --git a/test/pytest/test_onnx.py b/test/pytest/test_onnx.py index 477e57e4fa..dd466544ee 100644 --- a/test/pytest/test_onnx.py +++ b/test/pytest/test_onnx.py @@ -1,18 +1,7 @@ import subprocess -import pytest import torch - -try: - import onnx - import torch.onnx - - print( - onnx.__version__ - ) # Adding this so onnx import doesn't get removed by pre-commit - ONNX_ENABLED = True -except: - ONNX_ENABLED = False +import torch.onnx class ToyModel(torch.nn.Module): @@ -28,7 +17,6 @@ def forward(self, x): # For a custom model you still need to manually author your converter; as far as I can tell there isn't a nice out-of-the-box one -@pytest.mark.skipif(ONNX_ENABLED == False, reason="ONNX is not installed") def test_convert_to_onnx(): model = ToyModel() dummy_input = torch.randn(1, 1) @@ -55,7 +43,6 @@ def test_convert_to_onnx(): ) -@pytest.mark.skipif(ONNX_ENABLED == False, reason="ONNX is not installed") def test_model_packaging_and_start(): subprocess.run("mkdir model_store", shell=True) subprocess.run( @@ -65,7 +52,6 @@ def test_model_packaging_and_start(): ) -@pytest.mark.skipif(ONNX_ENABLED == False, reason="ONNX is not installed") def test_model_start(): subprocess.run( "torchserve --start --ncs --model-store model_store --models onnx.mar", @@ -74,7 +60,6 @@ def test_model_start(): ) -@pytest.mark.skipif(ONNX_ENABLED == False, reason="ONNX is not installed") def test_inference(): subprocess.run( "curl -X POST http://127.0.0.1:8080/predictions/onnx --data-binary '1'", @@ -82,6 +67,5 @@ def test_inference(): ) -@pytest.mark.skipif(ONNX_ENABLED == False, reason="ONNX is not installed") def test_stop(): subprocess.run("torchserve --stop", shell=True, check=True) diff --git a/test/pytest/test_sm_mme_requirements.py b/test/pytest/test_sm_mme_requirements.py new file mode 100644 index 0000000000..a667b72883 --- /dev/null +++ b/test/pytest/test_sm_mme_requirements.py @@ -0,0 +1,113 @@ +import os +import pathlib + +import pytest +import requests +import test_utils +import torch + +CURR_FILE_PATH = os.path.dirname(os.path.realpath(__file__)) +REPO_ROOT = os.path.normpath(os.path.join(CURR_FILE_PATH, "..", "..")) +MODELSTORE_DIR = os.path.join(REPO_ROOT, "model_store") +data_file_kitten = os.path.join(REPO_ROOT, "examples/image_classifier/kitten.jpg") +HF_TRANSFORMERS_EXAMPLE_DIR = os.path.join( + REPO_ROOT, "examples/Huggingface_Transformers/" +) + + +def test_no_model_loaded(): + """ + Validates that TorchServe returns response code 404 if no model is
loaded. + """ + + os.makedirs(MODELSTORE_DIR, exist_ok=True) # Create modelstore directory + test_utils.start_torchserve(model_store=MODELSTORE_DIR) + + response = requests.post( + url="http://localhost:8080/models/alexnet/invoke", + data=open(data_file_kitten, "rb"), + ) + assert response.status_code == 404, "Model not loaded error expected" + + +@pytest.mark.skipif( + not ((torch.cuda.device_count() > 0) and torch.cuda.is_available()), + reason="Test to be run on GPU only", +) +def test_oom_on_model_load(): + """ + Validates that TorchServe returns reponse code 507 if there is OOM on model loading. + """ + + # Create model store directory + pathlib.Path(test_utils.MODEL_STORE).mkdir(parents=True, exist_ok=True) + + # Start TorchServe + test_utils.start_torchserve(no_config_snapshots=True) + + # Register model + params = { + "model_name": "BERTSeqClassification", + "url": "https://torchserve.pytorch.org/mar_files/BERTSeqClassification.mar", + "batch_size": 1, + "initial_workers": 16, + } + response = test_utils.register_model_with_params(params) + + assert response.status_code == 507, "OOM Error expected" + + test_utils.stop_torchserve() + + +@pytest.mark.skipif( + not ((torch.cuda.device_count() > 0) and torch.cuda.is_available()), + reason="Test to be run on GPU only", +) +def test_oom_on_invoke(): + # Create model store directory + pathlib.Path(test_utils.MODEL_STORE).mkdir(parents=True, exist_ok=True) + + # Start TorchServe + test_utils.start_torchserve(no_config_snapshots=True) + + # Register model + params = { + "model_name": "BERTSeqClassification", + "url": "https://torchserve.pytorch.org/mar_files/BERTSeqClassification.mar", + "batch_size": 8, + "initial_workers": 12, + } + response = test_utils.register_model_with_params(params) + + input_text = os.path.join( + REPO_ROOT, + "examples", + "Huggingface_Transformers", + "Seq_classification_artifacts", + "sample_text_captum_input.txt", + ) + + # Make 8 curl requests in parallel with & + # Send multiple requests to make sure to hit OOM + for i in range(10): + response = os.popen( + f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} && " + f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} && " + f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} && " + f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} && " + f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} && " + f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} && " + f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} && " + f"curl http://127.0.0.1:8080/models/BERTSeqClassification/invoke -T {input_text} " + ) + response = response.read() + + # If OOM is hit, we expect code 507 to be present in the response string + lines = response.split("\n") + output = "" + for line in lines: + if "code" in line: + line = line.strip() + output = line + break + assert output == '"code": 507,', "OOM Error expected" diff --git a/test/pytest/test_torch_compile.py b/test/pytest/test_torch_compile.py new file mode 100644 index 0000000000..9983707c8b --- /dev/null +++ b/test/pytest/test_torch_compile.py @@ -0,0 +1,109 @@ +import glob +import json +import os +import platform +import subprocess +import time +from pathlib import Path + +import pytest +import torch +from pkg_resources import packaging + +PT_2_AVAILABLE = ( + True + if packaging.version.parse(torch.__version__) >= 
packaging.version.parse("2.0") + else False +) + + +CURR_FILE_PATH = Path(__file__).parent +TEST_DATA_DIR = os.path.join(CURR_FILE_PATH, "test_data", "torch_compile") + +MODEL_FILE = os.path.join(TEST_DATA_DIR, "model.py") +HANDLER_FILE = os.path.join(TEST_DATA_DIR, "compile_handler.py") +YAML_CONFIG = os.path.join(TEST_DATA_DIR, "pt2.yaml") + + +SERIALIZED_FILE = os.path.join(TEST_DATA_DIR, "model.pt") +MODEL_STORE_DIR = os.path.join(TEST_DATA_DIR, "model_store") +MODEL_NAME = "half_plus_two" + + +@pytest.mark.skipif( + platform.system() != "Linux", reason="Skipping test on non-Linux system" +) +@pytest.mark.skipif(PT_2_AVAILABLE == False, reason="torch version is < 2.0.0") +class TestTorchCompile: + def teardown_class(self): + subprocess.run("torchserve --stop", shell=True, check=True) + time.sleep(10) + + def test_archive_model_artifacts(self): + assert len(glob.glob(MODEL_FILE)) == 1 + assert len(glob.glob(YAML_CONFIG)) == 1 + subprocess.run(f"cd {TEST_DATA_DIR} && python model.py", shell=True, check=True) + subprocess.run(f"mkdir -p {MODEL_STORE_DIR}", shell=True, check=True) + subprocess.run( + f"torch-model-archiver --model-name {MODEL_NAME} --version 1.0 --model-file {MODEL_FILE} --serialized-file {SERIALIZED_FILE} --config-file {YAML_CONFIG} --export-path {MODEL_STORE_DIR} --handler {HANDLER_FILE} -f", + shell=True, + check=True, + ) + assert len(glob.glob(SERIALIZED_FILE)) == 1 + assert len(glob.glob(os.path.join(MODEL_STORE_DIR, f"{MODEL_NAME}.mar"))) == 1 + + def test_start_torchserve(self): + cmd = f"torchserve --start --ncs --models {MODEL_NAME}.mar --model-store {MODEL_STORE_DIR}" + subprocess.run( + cmd, + shell=True, + check=True, + ) + time.sleep(10) + assert len(glob.glob("logs/access_log.log")) == 1 + assert len(glob.glob("logs/model_log.log")) == 1 + assert len(glob.glob("logs/ts_log.log")) == 1 + + def test_server_status(self): + result = subprocess.run( + "curl http://localhost:8080/ping", + shell=True, + capture_output=True, + check=True, + ) + expected_server_status_str = '{"status": "Healthy"}' + expected_server_status = json.loads(expected_server_status_str) + assert json.loads(result.stdout) == expected_server_status + + def test_registered_model(self): + result = subprocess.run( + "curl http://localhost:8081/models", + shell=True, + capture_output=True, + check=True, + ) + expected_registered_model_str = '{"models": [{"modelName": "half_plus_two", "modelUrl": "half_plus_two.mar"}]}' + expected_registered_model = json.loads(expected_registered_model_str) + assert json.loads(result.stdout) == expected_registered_model + + def test_serve_inference(self): + request_data = {"instances": [[1.0], [2.0], [3.0]]} + request_json = json.dumps(request_data) + + result = subprocess.run( + f"curl -s -X POST -H \"Content-Type: application/json;\" http://localhost:8080/predictions/half_plus_two -d '{request_json}'", + shell=True, + capture_output=True, + check=True, + ) + + string_result = result.stdout.decode("utf-8") + float_result = float(string_result) + expected_result = 3.5 + + assert float_result == expected_result + + model_log_path = glob.glob("logs/model_log.log")[0] + with open(model_log_path, "rt") as model_log_file: + model_log = model_log_file.read() + assert "Compiled model with backend inductor" in model_log diff --git a/test/pytest/test_torch_xla.py b/test/pytest/test_torch_xla.py new file mode 100644 index 0000000000..b42db67f79 --- /dev/null +++ b/test/pytest/test_torch_xla.py @@ -0,0 +1,106 @@ +import glob +import json +import os +import subprocess +import 
time +from pathlib import Path + +import pytest +from pkg_resources import packaging + +try: + import torch_xla + + TORCHXLA_AVAILABLE = ( + True + if packaging.version.parse(torch_xla.__version__) + >= packaging.version.parse("2.0") + else False + ) +except: + TORCHXLA_AVAILABLE = False + +CURR_FILE_PATH = Path(__file__).parent +TORCH_XLA_TEST_DATA_DIR = os.path.join(CURR_FILE_PATH, "test_data", "torch_compile") + +MODEL_FILE = os.path.join(TORCH_XLA_TEST_DATA_DIR, "model.py") +YAML_CONFIG = os.path.join(TORCH_XLA_TEST_DATA_DIR, "xla.yaml") +CONFIG_PROPERTIES = os.path.join(TORCH_XLA_TEST_DATA_DIR, "config.properties") + +SERIALIZED_FILE = os.path.join(TORCH_XLA_TEST_DATA_DIR, "model.pt") +MODEL_STORE_DIR = os.path.join(TORCH_XLA_TEST_DATA_DIR, "model_store") +MODEL_NAME = "half_plus_two" + + +@pytest.mark.skipif(TORCHXLA_AVAILABLE == False, reason="PyTorch/XLA is not installed") +class TestTorchXLA: + def teardown_class(self): + subprocess.run("torchserve --stop", shell=True, check=True) + time.sleep(10) + + def test_archive_model_artifacts(self): + assert len(glob.glob(MODEL_FILE)) == 1 + assert len(glob.glob(YAML_CONFIG)) == 1 + assert len(glob.glob(CONFIG_PROPERTIES)) == 1 + subprocess.run( + f"cd {TORCH_XLA_TEST_DATA_DIR} && python model.py", shell=True, check=True + ) + subprocess.run(f"mkdir -p {MODEL_STORE_DIR}", shell=True, check=True) + subprocess.run( + f"torch-model-archiver --model-name {MODEL_NAME} --version 1.0 --model-file {MODEL_FILE} --serialized-file {SERIALIZED_FILE} --config-file {YAML_CONFIG} --export-path {MODEL_STORE_DIR} --handler base_handler -f", + shell=True, + check=True, + ) + assert len(glob.glob(SERIALIZED_FILE)) == 1 + assert len(glob.glob(os.path.join(MODEL_STORE_DIR, f"{MODEL_NAME}.mar"))) == 1 + + def test_start_torchserve(self): + subprocess.run( + f"torchserve --start --ncs --models {MODEL_NAME}.mar --model-store {MODEL_STORE_DIR} --ts-config {CONFIG_PROPERTIES}", + shell=True, + check=True, + ) + time.sleep(10) + assert len(glob.glob("logs/access_log.log")) == 1 + assert len(glob.glob("logs/model_log.log")) == 1 + assert len(glob.glob("logs/ts_log.log")) == 1 + + def test_server_status(self): + result = subprocess.run( + "curl http://localhost:8080/ping", + shell=True, + capture_output=True, + check=True, + ) + expected_server_status_str = '{"status": "Healthy"}' + expected_server_status = json.loads(expected_server_status_str) + assert json.loads(result.stdout) == expected_server_status + + def test_registered_model(self): + result = subprocess.run( + "curl http://localhost:8081/models", + shell=True, + capture_output=True, + check=True, + ) + expected_registered_model_str = '{"models": [{"modelName": "half_plus_two", "modelUrl": "half_plus_two.mar"}]}' + expected_registered_model = json.loads(expected_registered_model_str) + assert json.loads(result.stdout) == expected_registered_model + + def test_serve_inference(self): + request = "'{\"" 'instances"' ": [[1.0], [2.0], [3.0]]}'" + result = subprocess.run( + f'curl -s -X POST -H "Content-Type: application/json;" http://localhost:8080/predictions/half_plus_two -d {request}', + shell=True, + capture_output=True, + check=True, + ) + expected_result_str = '{"predictions": [[2.5], [3.0], [3.5]]}' + expected_result = json.loads(expected_result_str) + assert json.loads(result.stdout) == expected_result + + model_log_path = glob.glob("logs/model_log.log")[0] + with open(model_log_path, "rt") as model_log_file: + model_log = model_log_file.read() + assert "Compiled model with backend torchxla_trace_once" in 
model_log + assert "done compiler function torchxla_trace_once" in model_log diff --git a/test/pytest/test_utils.py b/test/pytest/test_utils.py index 40354187df..23bd45ab7b 100644 --- a/test/pytest/test_utils.py +++ b/test/pytest/test_utils.py @@ -5,9 +5,10 @@ import subprocess import sys import tempfile -import time +import threading from os import path from pathlib import Path +from subprocess import PIPE, STDOUT, Popen import requests @@ -21,6 +22,16 @@ CODEBUILD_WD = path.abspath(path.join(__file__, "../../..")) +class PrintPipeTillTheEnd(threading.Thread): + def __init__(self, pipe): + super().__init__() + self.pipe = pipe + + def run(self): + for line in self.pipe.stdout: + print(line.decode("utf-8").strip()) + + def start_torchserve( model_store=None, snapshot_file=None, no_config_snapshots=False, gen_mar=True ): @@ -36,13 +47,18 @@ def start_torchserve( if no_config_snapshots: cmd.extend(["--no-config-snapshots"]) print(cmd) - subprocess.run(cmd) - time.sleep(10) + + p = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=STDOUT) + for line in p.stdout: + print(line.decode("utf8").strip()) + if "Model server started" in str(line).strip(): + break + print_thread = PrintPipeTillTheEnd(p) + print_thread.start() def stop_torchserve(): - subprocess.run(["torchserve", "--stop"]) - time.sleep(10) + subprocess.run(["torchserve", "--stop", "--foreground"]) def delete_all_snapshots(): @@ -118,31 +134,41 @@ def model_archiver_command_builder( handler=None, extra_files=None, force=False, + config_file=None, ): - cmd = "torch-model-archiver" + # Initialize a list to store the command-line arguments + cmd_parts = ["torch-model-archiver"] + # Append arguments to the list if model_name: - cmd += " --model-name {0}".format(model_name) + cmd_parts.append(f"--model-name {model_name}") if version: - cmd += " --version {0}".format(version) + cmd_parts.append(f"--version {version}") if model_file: - cmd += " --model-file {0}".format(model_file) + cmd_parts.append(f"--model-file {model_file}") if serialized_file: - cmd += " --serialized-file {0}".format(serialized_file) + cmd_parts.append(f"--serialized-file {serialized_file}") if handler: - cmd += " --handler {0}".format(handler) + cmd_parts.append(f"--handler {handler}") if extra_files: - cmd += " --extra-files {0}".format(extra_files) + cmd_parts.append(f"--extra-files {extra_files}") + + if config_file: + cmd_parts.append(f"--config-file {config_file}") if force: - cmd += " --force" + cmd_parts.append("--force") - cmd += " --export-path {0}".format(MODEL_STORE) + # Append the export-path argument to the list + cmd_parts.append(f"--export-path {MODEL_STORE}") + + # Convert the list into a string to represent the complete command + cmd = " ".join(cmd_parts) return cmd @@ -159,3 +185,9 @@ def load_module_from_py_file(py_file: str) -> object: loader.exec_module(module) return module + + +def cleanup_model_store(model_store=None): + # rm -rf $MODEL_STORE_DIR / * + for f in glob.glob(os.path.join(model_store, "*")): + os.remove(f) diff --git a/test/resources/config.properties b/test/resources/config.properties index 7fc6a5c6f8..7787ef9f75 100644 --- a/test/resources/config.properties +++ b/test/resources/config.properties @@ -3,3 +3,4 @@ management_address=https://127.0.0.1:8444 metrics_address=https://127.0.0.1:8445 private_key_file=resources/key.pem certificate_file=resources/certs.pem +install_py_dep_per_model=true diff --git a/torchserve_sanity.py b/torchserve_sanity.py index 27e86c5480..d92689d530 100755 --- a/torchserve_sanity.py +++ b/torchserve_sanity.py @@ 
-1,14 +1,16 @@ -from ts_scripts.modelarchiver_utils import test_modelarchiver -from ts_scripts.workflow_archiver_utils import test_workflow_archiver +import ts_scripts.tsutils as ts +from ts_scripts import marsgen as mg from ts_scripts.backend_utils import test_torchserve +from ts_scripts.frontend_utils import test_frontend from ts_scripts.install_from_src import install_from_src -from ts_scripts.sanity_utils import test_sanity -from ts_scripts.sanity_utils import test_workflow_sanity +from ts_scripts.modelarchiver_utils import test_modelarchiver +from ts_scripts.sanity_utils import ( + test_markdown_files, + test_sanity, + test_workflow_sanity, +) from ts_scripts.shell_utils import rm_dir, rm_file -from ts_scripts.frontend_utils import test_frontend -import ts_scripts.tsutils as ts -import ts_scripts.print_env_info as build_hdr_printer -from ts_scripts import marsgen as mg +from ts_scripts.workflow_archiver_utils import test_workflow_archiver def torchserve_sanity(): @@ -37,22 +39,27 @@ def torchserve_sanity(): # Run workflow sanity test_workflow_sanity() + # Check for broken links + test_markdown_files() + finally: cleanup() def cleanup(): ts.stop_torchserve() - rm_dir('model_store') - rm_dir('logs') + rm_dir("model_store") + rm_dir("logs") # clean up residual from model-archiver IT suite. - rm_dir('model-archiver/model_archiver/htmlcov_ut model_archiver/model-archiver/htmlcov_it') - rm_file('ts_scripts/*_pb2*.py', True) + rm_dir( + "model-archiver/model_archiver/htmlcov_ut model_archiver/model-archiver/htmlcov_it" + ) + rm_file("ts_scripts/*_pb2*.py", True) # delete mar_gen_dir mg.delete_model_store_gen_dir() -if __name__ == '__main__': +if __name__ == "__main__": torchserve_sanity() diff --git a/ts/arg_parser.py b/ts/arg_parser.py index 0a1d0595e1..49aea5fcf1 100644 --- a/ts/arg_parser.py +++ b/ts/arg_parser.py @@ -61,7 +61,7 @@ def ts_parser(): parser.add_argument( "--foreground", help="Run the model server in foreground. If this option is disabled, the model server" - " will run in the background.", + " will run in the background. 
In combination with --stop, the program waits for the model server to terminate.", action="store_true", ) parser.add_argument( diff --git a/ts/configs/metrics.yaml b/ts/configs/metrics.yaml index 5bb6af6ead..696ddb85e1 100644 --- a/ts/configs/metrics.yaml +++ b/ts/configs/metrics.yaml @@ -1,34 +1,77 @@ dimensions: - - &model_name "model_name" + - &model_name "ModelName" + - &worker_name "WorkerName" - &level "Level" + - &device_id "DeviceId" + - &hostname "Hostname" ts_metrics: counter: - - name: CounterTsMetricExample - unit: ms - dimensions: [*model_name, *level] + - name: Requests2XX + unit: Count + dimensions: [*level, *hostname] + - name: Requests4XX + unit: Count + dimensions: [*level, *hostname] + - name: Requests5XX + unit: Count + dimensions: [*level, *hostname] + - name: ts_inference_requests_total + unit: Count + dimensions: ["model_name", "model_version", "hostname"] + - name: ts_inference_latency_microseconds + unit: Microseconds + dimensions: ["model_name", "model_version", "hostname"] + - name: ts_queue_latency_microseconds + unit: Microseconds + dimensions: ["model_name", "model_version", "hostname"] gauge: - - name: GaugeTsMetricExample - unit: ms - dimensions: [*model_name, *level] - histogram: - - name: HistogramTsMetricExample - unit: ms - dimensions: [*model_name, *level] + - name: QueueTime + unit: Milliseconds + dimensions: [*level, *hostname] + - name: WorkerThreadTime + unit: Milliseconds + dimensions: [*level, *hostname] + - name: WorkerLoadTime + unit: Milliseconds + dimensions: [*worker_name, *level, *hostname] + - name: CPUUtilization + unit: Percent + dimensions: [*level, *hostname] + - name: MemoryUsed + unit: Megabytes + dimensions: [*level, *hostname] + - name: MemoryAvailable + unit: Megabytes + dimensions: [*level, *hostname] + - name: MemoryUtilization + unit: Percent + dimensions: [*level, *hostname] + - name: DiskUsage + unit: Gigabytes + dimensions: [*level, *hostname] + - name: DiskUtilization + unit: Percent + dimensions: [*level, *hostname] + - name: DiskAvailable + unit: Gigabytes + dimensions: [*level, *hostname] + - name: GPUMemoryUtilization + unit: Percent + dimensions: [*level, *device_id, *hostname] + - name: GPUMemoryUsed + unit: Megabytes + dimensions: [*level, *device_id, *hostname] + - name: GPUUtilization + unit: Percent + dimensions: [*level, *device_id, *hostname] model_metrics: - counter: - - name: InferenceTimeInMS - unit: ms - dimensions: [*model_name, *level] - - name: NumberOfMetrics - unit: count - dimensions: [*model_name, *level] + # Dimension "Hostname" is automatically added for model metrics in the backend gauge: - - name: GaugeModelMetricNameExample + - name: HandlerTime unit: ms dimensions: [*model_name, *level] - histogram: - - name: HistogramModelMetricNameExample + - name: PredictionTime unit: ms dimensions: [*model_name, *level] diff --git a/ts/context.py b/ts/context.py index 97e79ae84b..82a7e352d9 100644 --- a/ts/context.py +++ b/ts/context.py @@ -21,6 +21,7 @@ def __init__( mms_version, limit_max_image_pixels=True, metrics=None, + model_yaml_config=None, ): self.model_name = model_name self.manifest = manifest @@ -37,6 +38,9 @@ def __init__( self._metrics = None self._limit_max_image_pixels = True self.metrics = metrics + self.model_yaml_config = model_yaml_config + # add client socket variable cl_socket, used by send_intermediate_predict_response + self.cl_socket = None @property def system_properties(self): diff --git a/ts/handler_utils/__init__.py b/ts/handler_utils/__init__.py new file mode 100644 index
0000000000..e69de29bb2 diff --git a/ts/handler_utils/distributed/__init__.py b/ts/handler_utils/distributed/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/ts/handler_utils/distributed/deepspeed.py b/ts/handler_utils/distributed/deepspeed.py new file mode 100644 index 0000000000..91dd94d4c6 --- /dev/null +++ b/ts/handler_utils/distributed/deepspeed.py @@ -0,0 +1,42 @@ +import logging +import os + +import deepspeed + +from ts.context import Context + + +def get_ds_engine(model, ctx: Context): + model_dir = ctx.system_properties.get("model_dir") + ds_config = None + checkpoint = None + if "deepspeed" in ctx.model_yaml_config: + # config: the deepspeed config json file path. + # deepspeed config parameters: + # https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/inference/config.py + if "config" in ctx.model_yaml_config["deepspeed"]: + ds_config = os.path.join( + model_dir, ctx.model_yaml_config["deepspeed"]["config"] + ) + if not os.path.exists(ds_config): + raise ValueError( + f"{ctx.model_name} has no deepspeed config file {ds_config}" + ) + + if "checkpoint" in ctx.model_yaml_config["deepspeed"]: + checkpoint = os.path.join( + model_dir, ctx.model_yaml_config["deepspeed"]["checkpoint"] + ) + if not os.path.exists(checkpoint): + raise ValueError( + f"{ctx.model_name} has no deepspeed checkpoint file {checkpoint}" + ) + logging.debug("Creating DeepSpeed engine") + ds_engine = deepspeed.init_inference( + model, config=ds_config, checkpoint=checkpoint + ) + return ds_engine + else: + raise ValueError( + f"{ctx.model_name} has no deepspeed config in model config yaml file" + ) diff --git a/ts/handler_utils/distributed/pt_pippy.py b/ts/handler_utils/distributed/pt_pippy.py new file mode 100644 index 0000000000..11d3f37793 --- /dev/null +++ b/ts/handler_utils/distributed/pt_pippy.py @@ -0,0 +1,134 @@ +import importlib +import inspect +import logging +import os + +import torch +import torch.distributed.rpc as rpc + +pippy_installed = importlib.util.find_spec("pippy") is not None + +if pippy_installed: + import pippy + from pippy import split_into_equal_size + from pippy.hf import PiPPyHFTracer, inject_pipeline_forward + + +logger = logging.getLogger(__name__) + + +def initialize_rpc_workers(local_rank, world_size, ctx): + # Get RPC configuration options from model YAML config + rpc_timeout = ctx.model_yaml_config["pippy"]["rpc_timeout"] + num_worker_threads = ctx.model_yaml_config["pippy"]["num_worker_threads"] + options = rpc.TensorPipeRpcBackendOptions( + num_worker_threads=num_worker_threads, rpc_timeout=rpc_timeout + ) + + # Set up device mapping for RPC workers + n_devs = torch.cuda.device_count() + dev_id = local_rank % n_devs + for i in range(world_size): + options.set_device_map(f"worker{i}", {dev_id: i % n_devs}) + + # Initialize RPC worker + logger.info(f"rank = {local_rank} pid/device = " f"{os.getpid()}/{dev_id}") + rpc.init_rpc( + f"worker{local_rank}", + rank=local_rank, + world_size=world_size, + rpc_backend_options=options, + ) + + +def get_pipeline_driver(model, world_size, ctx): + """Returns a pipeline driver for the given model. + Args: + model (torch.nn.Module): The model to pipeline. + world_size (int): The number of pipeline stages. + ctx (Context): The context containing configuration information. + Returns: + torch.nn.Sequential: The pipeline driver for the model.
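+    Example model-config YAML consumed here (a sketch; the key names mirror the assertions below, the values are hypothetical): + pippy: + rpc_timeout: 1800 + num_worker_threads: 512 + input_names: ["input_ids"] + model_type: "HF" + chunks: 1 + handler: + model_path: "model"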
+ """ + # Extract configuration parameters from the context + + # Check that the "pippy" and "handler" keys are present in the YAML config + assert "pippy" in ctx.model_yaml_config, "Missing 'pippy' key in YAML config" + assert "handler" in ctx.model_yaml_config, "Missing 'handler' key in YAML config" + + # Check that the required keys are present in the "pippy" section + + assert ( + "input_names" in ctx.model_yaml_config["pippy"] + ), "Missing 'input_names' key in YAML config" + assert ( + "model_type" in ctx.model_yaml_config["pippy"] + ), "Missing 'model_type' key in YAML config" + + # Check that the required keys are present in the "handler" section + assert ( + "model_path" in ctx.model_yaml_config["handler"] + ), "Missing 'model_path' key in YAML config" + + # Set variables from the config + + input_names = ctx.model_yaml_config["pippy"]["input_names"] + model_type = ctx.model_yaml_config["pippy"]["model_type"] + model_path = ctx.model_yaml_config["handler"]["model_path"] + try: + chunks = ctx.model_yaml_config["pippy"]["chunks"] + except KeyError: + chunks = 1 + try: + index_filename = ctx.model_yaml_config["handler"]["index_filename"] + except KeyError: + index_filename = None + + # Check that the index file exists + if index_filename is not None: + index_file_path = os.path.join(model_path, index_filename) + assert os.path.exists( + index_file_path + ), f"Index file '{index_file_path}' not found" + else: + index_file_path = None + + checkpoint_prefix = None + # Set the model to evaluation mode + model.eval() + + # Extract the concrete arguments for the model's forward method + sig = inspect.signature(model.forward) + concrete_args = { + p.name: p.default for p in sig.parameters.values() if p.name not in input_names + } + + logger.info("Initializing the model pipeline") + + # Create a tracer if necessary + tracer = PiPPyHFTracer() if model_type == "HF" else None + + # Add deprecated_arguments to concrete_args if necessary + if model_type == "HF" and "bloom" in str(model.__class__): + concrete_args.setdefault("deprecated_arguments", {}) + + # Compile the pipeline using PiPPy + split_policy = split_into_equal_size(world_size) + pipe_driver, stage_mode = pippy.all_compile( + model, + num_ranks=world_size, + num_chunks=chunks, + schedule="FillDrain", + split_policy=split_policy, + tracer=tracer, + concrete_args=concrete_args, + index_filename=index_file_path, + checkpoint_prefix=checkpoint_prefix, + ) + + # Inject the pipeline forward method if necessary + if model_type == "HF": + inject_pipeline_forward(model, pipe_driver) + return model + else: + return pipe_driver diff --git a/ts/handler_utils/micro_batching.py b/ts/handler_utils/micro_batching.py new file mode 100644 index 0000000000..ae9671f707 --- /dev/null +++ b/ts/handler_utils/micro_batching.py @@ -0,0 +1,177 @@ +import os +import queue +import threading +import time +from copy import copy +from dataclasses import dataclass +from typing import Dict + +try: + PROFILER_AVAILABLE = True +except ImportError: + PROFILER_AVAILABLE = False + + +HANDLER_METHODS = ["preprocess", "inference", "postprocess"] + + +def execute_call(in_queue, out_queue, handle, event): + while not event.is_set(): + try: + idx, in_data = in_queue.get(timeout=0.5) + except queue.Empty: + continue + out_data = handle(in_data) + out_queue.put((idx, out_data)) + + +@dataclass +class WorkerThread: + event: threading.Event + thread: threading.Thread + + +class MicroBatching(object): + def __init__( + self, parent_handler, micro_batch_size: int = 1, parallelism: 
Dict = None + ): + self.handler = parent_handler + self.micro_batch_size = micro_batch_size + self._parallelism = parallelism if parallelism is not None else {} + self.thread_groups = {c: [] for c in HANDLER_METHODS} + self.queues = {} + self.terminate = threading.Event() + self._create_queues() + self._update_threads() + + def __del__(self): + self.shutdown() + + @property + def parallelism(self) -> Dict: + return copy(self._parallelism) + + @parallelism.setter + def parallelism(self, parallelism: Dict): + """Set number of threads for each of the processing steps. + + Args: + parallelism (Dict): New number of threads per processing step + + Returns: + None + """ + assert all(k in HANDLER_METHODS for k in parallelism.keys()) + + self._parallelism.update(parallelism) + self._update_threads() + + def shutdown(self): + """Shuts down all running threads. + + Args: + None + + Returns: + None + """ + for _, tg in self.thread_groups.items(): + for t in tg: + t.event.set() + t.thread.join() + + def _create_queues(self): + # Set up processing queues + self.queues[HANDLER_METHODS[0] + "_in"] = queue.Queue() + for i in range(len(HANDLER_METHODS) - 1): + # Each "out" queue is the "in" queue of the next processing step + self.queues[HANDLER_METHODS[i] + "_out"] = queue.Queue() + self.queues[HANDLER_METHODS[i + 1] + "_in"] = self.queues[ + HANDLER_METHODS[i] + "_out" + ] + self.queues[HANDLER_METHODS[-1] + "_out"] = queue.Queue() + + def _update_threads(self): + for c in HANDLER_METHODS: + tgt_parallelism = self._parallelism.get(c, 1) + assert tgt_parallelism >= 0 + cur_parallelism = lambda: len(self.thread_groups[c]) + + # Scale up threads if necessary + while tgt_parallelism > cur_parallelism(): + in_queue = self.queues[c + "_in"] + out_queue = self.queues[c + "_out"] + call = getattr(self.handler, c) + event = threading.Event() + + t = threading.Thread( + target=execute_call, + args=(in_queue, out_queue, call, event), + ) + t.start() + self.thread_groups[c].append(WorkerThread(event, t)) + + # Scale down threads if necessary + while tgt_parallelism < cur_parallelism(): + self.thread_groups[c][-1].event.set() + self.thread_groups[c][-1].thread.join() + self.thread_groups[c].pop() + + def handle(self, data): + num_batches = 0 + for idx, i in enumerate(range(0, len(data), self.micro_batch_size)): + self.queues[HANDLER_METHODS[0] + "_in"].put_nowait( + (idx, data[i : i + self.micro_batch_size]) + ) + num_batches += 1 + + output = [] + while len(output) != num_batches: + output.append(self.queues[HANDLER_METHODS[-1] + "_out"].get()) + + return [item for batch in sorted(output) for item in batch[1]] + + def __call__(self, data, context): + """Entry point for default handler. It takes the data from the input request and returns + the predicted outcome for the input. This method is a modified variant from the BaseHandler. + It calls the MicroBatching handle method instead of running the single processing steps. + + Args: + data (list): The input data that needs to be made a prediction request on. + context (Context): It is a JSON Object containing information pertaining to + the model artefacts parameters. + + Returns: + list : Returns a list of dictionary with the predicted response. 
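+        Usage sketch, mirroring ts/tests/unit_tests/test_micro_batching.py: + handler = ImageClassifier() + mb_handle = MicroBatching(handler, micro_batch_size=2) + handler.initialize(context) + handler.handle = mb_handle + mb_handle.parallelism = {"preprocess": 1, "inference": 2, "postprocess": 3}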
+ """ + + # It can be used for pre or post processing if needed as additional request + # information is available in context + start_time = time.time() + + self.handler.context = context + metrics = self.handler.context.metrics + + is_profiler_enabled = os.environ.get("ENABLE_TORCH_PROFILER", None) + if is_profiler_enabled: + if PROFILER_AVAILABLE: + output, _ = self.handler._infer_with_profiler(data=data) + else: + raise RuntimeError( + "Profiler is enabled but current version of torch does not support." + "Install torch>=1.8.1 to use profiler." + ) + else: + if self.handler._is_describe(): + output = [self.handler.describe_handle()] + elif self.handler._is_explain(): + data_preprocess = self.handler.preprocess(data) + output = self.handler.explain_handle(data_preprocess, data) + else: + output = self.handle(data) + + stop_time = time.time() + metrics.add_time( + "HandlerTime", round((stop_time - start_time) * 1000, 2), None, "ms" + ) + return output diff --git a/ts/model_server.py b/ts/model_server.py index bf3a2e9b8f..2a50932fa5 100644 --- a/ts/model_server.py +++ b/ts/model_server.py @@ -9,7 +9,7 @@ import sys import tempfile from builtins import str - +from typing import Dict import psutil from ts.arg_parser import ArgParser @@ -18,7 +18,7 @@ TS_NAMESPACE = "org.pytorch.serve.ModelServer" -def start(): +def start() -> None: """ This is the entry point for model server :return: @@ -47,7 +47,13 @@ def start(): try: parent = psutil.Process(pid) parent.terminate() - print("TorchServe has stopped.") + if args.foreground: + try: + parent.wait(timeout=60) + except psutil.TimeoutExpired: + print("Stopping TorchServe took too long.") + else: + print("TorchServe has stopped.") except (OSError, psutil.Error): print("TorchServe already stopped.") os.remove(pid_file) @@ -98,7 +104,8 @@ def start(): sys.exit(1) ts_conf_file = ts_config - platform_path_separator = {"Windows": "", "Darwin": ".:", "Linux": ".:"} + platform_path_separator = { + "Windows": "", "Darwin": ".:", "Linux": ".:"} class_path = "{}{}".format( platform_path_separator[platform.system()], os.path.join(ts_home, "ts", "frontend", "*"), @@ -201,7 +208,7 @@ def start(): print("start java frontend failed:", sys.exc_info()) -def load_properties(file_path): +def load_properties(file_path: str) -> Dict[str, str]: """ Read properties file into map. """ @@ -214,7 +221,6 @@ def load_properties(file_path): if len(pair) > 1: key = pair[0].strip() props[key] = pair[1].strip() - return props diff --git a/ts/model_service_worker.py b/ts/model_service_worker.py index 9b18fd6038..57e8bf9a7c 100644 --- a/ts/model_service_worker.py +++ b/ts/model_service_worker.py @@ -10,7 +10,6 @@ import platform import socket import sys -import uuid from ts.arg_parser import ArgParser from ts.metrics.metric_cache_yaml_impl import MetricsCacheYamlImpl @@ -22,6 +21,10 @@ DEBUG = False BENCHMARK = os.getenv("TS_BENCHMARK") BENCHMARK = BENCHMARK in ["True", "true", "TRUE"] +LOCAL_RANK = int(os.getenv("LOCAL_RANK", 0)) +WORLD_SIZE = int(os.getenv("WORLD_SIZE", 0)) +WORLD_RANK = int(os.getenv("RANK", 0)) +LOCAL_WORLD_SIZE = int(os.getenv("LOCAL_WORLD_SIZE", 0)) class TorchModelServiceWorker(object): @@ -42,31 +45,41 @@ def __init__( if s_type == "unix": if s_name is None: raise ValueError("Wrong arguments passed. No socket name given.") - self.sock_name, self.port = s_name, -1 + s_name_parts = s_name.rsplit(".", 1) + logging.info( + "s_name_part0=%s, s_name_part1=%s, pid=%d", + s_name_parts[0], + s_name_parts[1], + os.getpid(), + ) + s_name_new = s_name_parts[0] + "." 
+ str(int(s_name_parts[1]) + LOCAL_RANK) + self.sock_name, self.port = s_name_new, -1 try: - os.remove(s_name) + os.remove(s_name_new) except OSError as e: - if os.path.exists(s_name): + if os.path.exists(s_name_new): raise RuntimeError( - "socket already in use: {}.".format(s_name) + "socket already in use: {}.".format(s_name_new) ) from e - + logging.info("Listening on port: %s", s_name_new) elif s_type == "tcp": self.sock_name = host_addr if host_addr is not None else "127.0.0.1" if port_num is None: raise ValueError("Wrong arguments passed. No socket port given.") - self.port = port_num + self.port = int(port_num) + LOCAL_RANK + logging.info("Listening on addr:port: %s:%d", self.sock_name, self.port) else: raise ValueError("Incomplete data provided") - logging.info("Listening on port: %s", s_name) socket_family = socket.AF_INET if s_type == "tcp" else socket.AF_UNIX self.sock = socket.socket(socket_family, socket.SOCK_STREAM) self.metrics_cache = MetricsCacheYamlImpl(config_file_path=metrics_config) if self.metrics_cache: self.metrics_cache.initialize_cache() else: - raise RuntimeError(f"Failed to initialize metrics from file {metrics_config}") + raise RuntimeError( + f"Failed to initialize metrics from file {metrics_config}" + ) def load_model(self, load_model_request): """ @@ -123,14 +136,31 @@ def load_model(self, load_model_request): batch_size, envelope, limit_max_image_pixels, - self.metrics_cache + self.metrics_cache, ) logging.debug("Model %s loaded.", model_name) return service, "loaded model {}".format(model_name), 200 - except MemoryError: + except MemoryError as ex: + logging.exception( + "Load model %s cpu OOM, exception %s", model_name, str(ex) + ) return None, "System out of memory", 507 + except RuntimeError as ex: # pylint: disable=broad-except + if "CUDA" in str(ex): + # Handles Case A: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED (Close to OOM) & + # Case B: CUDA out of memory (OOM) + logging.exception( + "Load model %s cuda OOM, exception %s", model_name, str(ex) + ) + return None, "System out of memory", 507 + else: + # Sanity testcases fail without this + logging.exception( + "Failed to load model %s, exception %s", model_name, str(ex) + ) + return None, "Unknown exception", 500 def handle_connection(self, cl_socket): """ @@ -157,6 +187,7 @@ def handle_connection(self, cl_socket): cl_socket.sendall(resp) if code != 200: raise RuntimeError("{} - {}".format(code, result)) + service.set_cl_socket(cl_socket) else: raise ValueError("Received unknown command: {}".format(cmd)) @@ -176,6 +207,7 @@ def run_server(self): self.sock.bind((self.sock_name, int(self.port))) self.sock.listen(1) + logging.info("[PID]%d", os.getpid()) logging.info("Torch worker started.") logging.info("Python runtime: %s", platform.python_version()) diff --git a/ts/protocol/otf_message_handler.py b/ts/protocol/otf_message_handler.py index d88f49e782..d05c472b6b 100644 --- a/ts/protocol/otf_message_handler.py +++ b/ts/protocol/otf_message_handler.py @@ -50,7 +50,9 @@ def encode_response_headers(resp_hdr_map): return msg -def create_predict_response(ret, req_id_map, message, code, context=None): +def create_predict_response( + ret, req_id_map, message, code, context=None, ts_stream_next=False +): """ Create inference response. 
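+    When ts_stream_next is True, the "ts_stream_next" response header is set to "true" so the frontend keeps the chunked HTTP response open for further intermediate results; the final call without ts_stream_next resets it to "false" and closes the stream. A handler-side sketch (hypothetical streaming handler built on the send_intermediate_predict_response helper added at the bottom of this file): + from ts.protocol.otf_message_handler import send_intermediate_predict_response + def handle(data, context): + for token in ["hello"] * 4: + send_intermediate_predict_response([token], context.request_ids, "Intermediate", 200, context) + return ["world"]  # the final chunk goes through the normal response path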
@@ -77,6 +79,12 @@ def create_predict_response(ret, req_id_map, message, code, context=None):
     if context is None:
         msg += struct.pack("!i", 0)  # content_type
     else:
+        if ts_stream_next is True:
+            context.set_response_header(idx, "ts_stream_next", "true")
+        else:
+            if context.get_response_headers(idx).get("ts_stream_next") == "true":
+                context.set_response_header(idx, "ts_stream_next", "false")
+
         content_type = context.get_response_content_type(idx)
         if content_type is None or len(content_type) == 0:
             msg += struct.pack("!i", 0)  # content_type
@@ -342,3 +350,9 @@ def _retrieve_input_data(conn):
         model_input["value"] = value
 
     return model_input
+
+
+def send_intermediate_predict_response(ret, req_id_map, message, code, context=None):
+    """Send an intermediate inference response to the frontend over the worker socket."""
+    msg = create_predict_response(ret, req_id_map, message, code, context, True)
+    context.cl_socket.sendall(msg)
diff --git a/ts/service.py b/ts/service.py
index c20fd79bed..14b72ac3ec 100644
--- a/ts/service.py
+++ b/ts/service.py
@@ -2,13 +2,14 @@
 CustomService class definitions
 """
 import logging
+import os
 import time
 from builtins import str
 
 import ts
 from ts.context import Context, RequestProcessor
 from ts.protocol.otf_message_handler import create_predict_response
-from ts.utils.util import PredictionException
+from ts.utils.util import PredictionException, get_yaml_config
 
 PREDICTION_METRIC = "PredictionTime"
 logger = logging.getLogger(__name__)
@@ -30,6 +31,15 @@ def __init__(
         limit_max_image_pixels=True,
         metrics_cache=None,
     ):
+        model_yaml_config = {}
+        if manifest is not None and "model" in manifest:
+            model = manifest["model"]
+            if "configFile" in model:
+                model_yaml_config_file = model["configFile"]
+                model_yaml_config = get_yaml_config(
+                    os.path.join(model_dir, model_yaml_config_file)
+                )
+
         self._context = Context(
             model_name,
             model_dir,
@@ -39,6 +49,7 @@
             ts.__version__,
             limit_max_image_pixels,
             metrics_cache,
+            model_yaml_config,
         )
         self._entry_point = entry_point
@@ -96,6 +107,9 @@ def retrieve_data_for_inference(batch):
 
         return headers, input_batch, req_to_id_map
 
+    def set_cl_socket(self, cl_socket):
+        self.context.cl_socket = cl_socket
+
     def predict(self, batch):
         """
         PREDICT COMMAND = {
@@ -118,15 +132,21 @@ def predict(self, batch):
         # noinspection PyBroadException
         try:
             ret = self._entry_point(input_batch, self.context)
-        except PredictionException as e:
-            logger.error("Prediction error", exc_info=True)
-            return create_predict_response(None, req_id_map, e.message, e.error_code)
         except MemoryError:
             logger.error("System out of memory", exc_info=True)
             return create_predict_response(None, req_id_map, "Out of resources", 507)
-        except Exception:  # pylint: disable=broad-except
-            logger.warning("Invoking custom service failed.", exc_info=True)
-            return create_predict_response(None, req_id_map, "Prediction failed", 503)
+        except PredictionException as e:
+            logger.error("Prediction error", exc_info=True)
+            return create_predict_response(None, req_id_map, e.message, e.error_code)
+        except Exception as ex:  # pylint: disable=broad-except
+            if "CUDA" in str(ex):
+                # Handles Case A: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED (Close to OOM) &
+                # Case B: CUDA out of memory (OOM)
+                logger.error("CUDA out of memory", exc_info=True)
+                return create_predict_response(None, req_id_map, "Out of resources", 507)
+            else:
+                logger.warning("Invoking custom service failed.", exc_info=True)
+                return create_predict_response(None, req_id_map, "Prediction failed", 503)
 
         if not isinstance(ret, list):
             logger.warning(
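Zooming out on the service.py change above: a YAML file packaged via torch-model-archiver's --config-file ends up in the manifest as configFile, Service parses it with get_yaml_config, and the Context exposes it to handlers as model_yaml_config. A rough sketch of the consumer side, assuming an archived config.yaml containing a line such as pt2: inductor (the handler class here is illustrative, not part of this diff):

from ts.torch_handler.base_handler import BaseHandler


class MyHandler(BaseHandler):
    def initialize(self, ctx):
        super().initialize(ctx)
        # Parsed config.yaml, or {} when the archive ships no config file;
        # "pt2" is the only key the base handler consumes in this diff.
        config = getattr(ctx, "model_yaml_config", {})
        self.pt2_backend = config.get("pt2")

diff --git a/ts/tests/unit_tests/test_micro_batching.py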
b/ts/tests/unit_tests/test_micro_batching.py new file mode 100644 index 0000000000..001de6411b --- /dev/null +++ b/ts/tests/unit_tests/test_micro_batching.py @@ -0,0 +1,181 @@ +""" +Unit test for MicroBatchHandler class. +""" +import json +import random +import sys +from pathlib import Path + +import pytest +from torchvision.models.resnet import ResNet18_Weights + +from ts.torch_handler.image_classifier import ImageClassifier +from ts.torch_handler.unit_tests.test_utils.mock_context import MockContext +from ts.torch_handler.unit_tests.test_utils.model_dir import copy_files, download_model + +REPO_DIR = Path(__file__).parents[3] + + +def read_image_bytes(filename): + with open( + filename, + "rb", + ) as fin: + image_bytes = fin.read() + return image_bytes + + +@pytest.fixture(scope="module") +def kitten_image_bytes(): + return read_image_bytes( + REPO_DIR.joinpath( + "examples/image_classifier/resnet_152_batch/images/kitten.jpg" + ).as_posix() + ) + + +@pytest.fixture(scope="module") +def dog_image_bytes(): + return read_image_bytes( + REPO_DIR.joinpath( + "examples/image_classifier/resnet_152_batch/images/dog.jpg" + ).as_posix() + ) + + +@pytest.fixture(scope="module") +def model_name(): + return "image_classifier" + + +@pytest.fixture(scope="module") +def model_dir(tmp_path_factory, model_name): + model_dir = tmp_path_factory.mktemp("image_classifier_model_dir") + + src_dir = REPO_DIR.joinpath("examples/image_classifier/resnet_18/") + + model_url = ResNet18_Weights.DEFAULT.url + + download_model(model_url, model_dir) + + files = { + "model.py": model_name + ".py", + "index_to_name.json": "index_to_name.json", + } + + copy_files(src_dir, model_dir, files) + + sys.path.append(model_dir.as_posix()) + yield model_dir + sys.path.pop() + + +@pytest.fixture(scope="module") +def context(model_dir, model_name): + micro_batching_params = { + "mb_size": 2, + "mb_parallelism": { + "preprocess": 1, + "inference": 2, + "postprocess": 3, + }, + } + + config_file = Path(model_dir).joinpath("micro_batching.json") + + with open(config_file, "w") as f: + json.dump(micro_batching_params, f) + + context = MockContext( + model_name="mnist", + model_dir=model_dir.as_posix(), + model_file=model_name + ".py", + ) + context.model_yaml_config = micro_batching_params + yield context + + +@pytest.fixture(scope="module", params=[1, 8]) +def handler(context, request): + handler = ImageClassifier() + + from ts.handler_utils.micro_batching import MicroBatching + + mb_handle = MicroBatching(handler, micro_batch_size=request.param) + handler.initialize(context) + + handler.handle = mb_handle + handler.handle.parallelism = context.model_yaml_config["mb_parallelism"] + + yield handler + + mb_handle.shutdown() + + +@pytest.fixture(scope="module", params=[1, 16]) +def mixed_batch(kitten_image_bytes, dog_image_bytes, request): + batch_size = request.param + labels = [ + "tiger_cat" if random.random() > 0.5 else "golden_retriever" + for _ in range(batch_size) + ] + test_data = [] + for l in labels: + test_data.append( + {"data": kitten_image_bytes} + if l == "tiger_cat" + else {"data": dog_image_bytes} + ) + return test_data, labels + + +def test_handle(context, mixed_batch, handler): + test_data, labels = mixed_batch + results = handler.handle(test_data, context) + assert len(results) == len(labels) + for l, r in zip(labels, results): + assert l in r + + +def test_handle_explain(context, kitten_image_bytes, handler): + context.explain = True + test_data = [{"data": kitten_image_bytes, "target": 0}] * 2 + results = 
handler.handle(test_data, context) + assert len(results) == 2 + assert results[0] + + +def test_micro_batching_handler_threads(handler): + assert len(handler.handle.thread_groups["preprocess"]) == 1 + assert len(handler.handle.thread_groups["inference"]) == 2 + assert len(handler.handle.thread_groups["postprocess"]) == 3 + + +def test_spin_up_down_threads(handler): + assert len(handler.handle.thread_groups["preprocess"]) == 1 + assert len(handler.handle.thread_groups["inference"]) == 2 + assert len(handler.handle.thread_groups["postprocess"]) == 3 + + new_parallelism = { + "preprocess": 2, + "inference": 3, + "postprocess": 4, + } + + handler.handle.parallelism = new_parallelism + + assert len(handler.handle.thread_groups["preprocess"]) == 2 + assert len(handler.handle.thread_groups["inference"]) == 3 + assert len(handler.handle.thread_groups["postprocess"]) == 4 + + new_parallelism = { + "preprocess": 1, + "inference": 2, + "postprocess": 3, + } + + handler.handle.parallelism = new_parallelism + + assert len(handler.handle.thread_groups["preprocess"]) == 1 + assert len(handler.handle.thread_groups["inference"]) == 2 + assert len(handler.handle.thread_groups["postprocess"]) == 3 diff --git a/ts/tests/unit_tests/test_model_service_worker.py b/ts/tests/unit_tests/test_model_service_worker.py index b0bd4ac7cb..a17ede9650 100644 --- a/ts/tests/unit_tests/test_model_service_worker.py +++ b/ts/tests/unit_tests/test_model_service_worker.py @@ -43,7 +43,7 @@ def socket_patches(mocker): def model_service_worker(socket_patches): if not sys.platform.startswith("win"): model_service_worker = TorchModelServiceWorker( - "unix", "my-socket", None, None, metrics_config_path + "unix", "my-socket.9999", None, None, metrics_config_path ) else: model_service_worker = TorchModelServiceWorker( @@ -59,7 +59,7 @@ def model_service_worker(socket_patches): sys.platform.startswith("win"), reason="Skipping linux/darwin specific test cases" ) class TestInit: - socket_name = "sampleSocketName" + socket_name = "sampleSocketName.9999" def test_missing_socket_name(self): with pytest.raises(ValueError, match="Incomplete data provided.*"): @@ -72,7 +72,7 @@ def test_socket_in_use(self, mocker): path_exists.return_value = True with pytest.raises( - Exception, match=r".*socket already in use: sampleSocketName.*" + Exception, match=r".*socket already in use: sampleSocketName.9999.*" ): TorchModelServiceWorker( "unix", self.socket_name, None, None, metrics_config_path diff --git a/ts/torch_handler/base_handler.py b/ts/torch_handler/base_handler.py index 4fc3ca19ee..08405e79fd 100644 --- a/ts/torch_handler/base_handler.py +++ b/ts/torch_handler/base_handler.py @@ -13,8 +13,8 @@ from pkg_resources import packaging from ..utils.util import ( + check_valid_pt2_backend, list_classes_from_module, - load_compiler_config, load_label_mapping, ) @@ -28,35 +28,71 @@ logger = logging.getLogger(__name__) -# Possible values for backend in utils.py -def check_pt2_enabled(): - try: - import torch._dynamo - - pt2_enabled = True - if torch.cuda.is_available(): - # If Ampere enable tensor cores which will give better performance - # Ideally get yourself an A10G or A100 for optimal performance - if torch.cuda.get_device_capability() >= (8, 0): - torch.backends.cuda.matmul.allow_tf32 = True - except ImportError as error: - logger.warning( - "dynamo/inductor are not installed. 
\n For GPU please run pip3 install numpy --pre torch[dynamo] --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117 \n for CPU please run pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu"
-        )
-        pt2_enabled = False
-    return pt2_enabled
+
+try:
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+except ImportError:
+    XLA_AVAILABLE = False
+
+
+if packaging.version.parse(torch.__version__) >= packaging.version.parse("2.0.0a"):
+    PT2_AVAILABLE = True
+    if torch.cuda.is_available():
+        # If Ampere, enable tensor cores, which will give better performance
+        # Ideally get yourself an A10G or A100 for optimal performance
+        if torch.cuda.get_device_capability() >= (8, 0):
+            torch.set_float32_matmul_precision("high")
+            logger.info("Enabled tensor cores")
+else:
+    logger.warning(
+        f"Your torch version is {torch.__version__} which does not support torch.compile"
+    )
+    PT2_AVAILABLE = False
 
-ipex_enabled = False
 if os.environ.get("TS_IPEX_ENABLE", "false") == "true":
     try:
         import intel_extension_for_pytorch as ipex
 
-        ipex_enabled = True
+        IPEX_AVAILABLE = True
     except ImportError as error:
         logger.warning(
             "IPEX is enabled but intel-extension-for-pytorch is not installed. Proceeding without IPEX."
         )
+        IPEX_AVAILABLE = False
+else:
+    IPEX_AVAILABLE = False
+
+
+try:
+    import onnxruntime as ort
+    import psutil
+
+    logger.info("ONNX enabled")
+    ONNX_AVAILABLE = True
+except ImportError:
+    logger.warning("Proceeding without onnxruntime")
+    ONNX_AVAILABLE = False
+
+
+def setup_ort_session(model_pt_path, map_location):
+    providers = (
+        ["CUDAExecutionProvider", "CPUExecutionProvider"]
+        if map_location == "cuda"
+        else ["CPUExecutionProvider"]
+    )
+
+    sess_options = ort.SessionOptions()
+    sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
+
+    # Start an inference session
+    ort_session = ort.InferenceSession(
+        model_pt_path, providers=providers, sess_options=sess_options
+    )
+
+    return ort_session
 
 
 class BaseHandler(abc.ABC):
@@ -90,28 +126,22 @@ def initialize(self, context):
             RuntimeError: Raises the Runtime error when the model.py is missing
 
         """
-        ipex_enabled = False
-        if os.environ.get("TS_IPEX_ENABLE", "false") == "true":
-            try:
-                import intel_extension_for_pytorch as ipex
-                ipex_enabled = True
-            except ImportError as error:
-                logger.warning(
-                    "IPEX is enabled but intel-extension-for-pytorch is not installed. Proceeding without IPEX."
-                )
+        if context is not None and hasattr(context, "model_yaml_config"):
+            self.model_yaml_config = context.model_yaml_config
 
         properties = context.system_properties
-        self.map_location = (
-            "cuda"
-            if torch.cuda.is_available() and properties.get("gpu_id") is not None
-            else "cpu"
-        )
-        self.device = torch.device(
-            self.map_location + ":" + str(properties.get("gpu_id"))
-            if torch.cuda.is_available() and properties.get("gpu_id") is not None
-            else self.map_location
-        )
+        if torch.cuda.is_available() and properties.get("gpu_id") is not None:
+            self.map_location = "cuda"
+            self.device = torch.device(
+                self.map_location + ":" + str(properties.get("gpu_id"))
+            )
+        elif XLA_AVAILABLE:
+            self.device = xm.xla_device()
+        else:
+            self.map_location = "cpu"
+            self.device = torch.device(self.map_location)
+
         self.manifest = context.manifest
 
         model_dir = properties.get("model_dir")
@@ -119,20 +149,6 @@
         if "serializedFile" in self.manifest["model"]:
             serialized_file = self.manifest["model"]["serializedFile"]
             self.model_pt_path = os.path.join(model_dir, serialized_file)
-
-        if self.model_pt_path:
-            if self.model_pt_path.endswith("onnx"):
-                try:
-                    # import numpy as np
-                    import onnxruntime as ort
-                    import psutil
-
-                    onnx_enabled = True
-                    logger.info("ONNX enabled")
-                except ImportError as error:
-                    onnx_enabled = False
-                    logger.warning("proceeding without onnxruntime")
-
         # model def file
         model_file = self.manifest["model"].get("modelFile", "")
 
@@ -151,47 +167,38 @@
             self.model.eval()
 
         # Convert your model by following instructions: https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html
-        # TODO(msaroufim): Refactor into utils https://github.com/pytorch/serve/issues/1631
-        elif self.model_pt_path.endswith(".onnx") and onnx_enabled:
-            # self.model = self._load_onnx_model(self.model_pt_path)
-            providers = (
-                ["CUDAExecutionProvider", "CPUExecutionProvider"]
-                if self.map_location == "cuda"
-                else ["CPUExecutionProvider"]
-            )
-
-            # Set the right inference options, we can add more options here depending on what people want
-            sess_options = ort.SessionOptions()
-            sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
-
-            # Start an inference session
-            ort_session = ort.InferenceSession(
-                self.model_pt_path, providers=providers, sess_options=sess_options
-            )
-            self.model = ort_session
+        elif self.model_pt_path.endswith(".onnx") and ONNX_AVAILABLE:
+            self.model = setup_ort_session(self.model_pt_path, self.map_location)
+            logger.info("Successfully set up ort session")
 
         else:
             raise RuntimeError("No model weights could be loaded")
 
-        optimization_config = os.path.join(model_dir, "compile.json")
-        backend = load_compiler_config(optimization_config)
+        if hasattr(self, "model_yaml_config") and "pt2" in self.model_yaml_config:
+            pt2_backend = self.model_yaml_config["pt2"]
+            valid_backend = check_valid_pt2_backend(pt2_backend)
+        else:
+            valid_backend = False
 
         # PT 2.0 support is opt in
-        if check_pt2_enabled() and backend:
+        if PT2_AVAILABLE and valid_backend:
             # Compilation will delay your model initialization
             try:
                 self.model = torch.compile(
-                    self.model, backend=backend, mode="reduce-overhead"
+                    self.model,
+                    backend=pt2_backend,
                 )
-                logger.info(f"Compiled model with backend {backend}")
-            except:
+                logger.info(f"Compiled model with backend {pt2_backend}")
+            except Exception as e:
                 logger.warning(
-                    f"Compiling model model with backend {backend} has failed \n Proceeding without compilation"
+                    f"Compiling model with backend {pt2_backend} has failed \n Proceeding without compilation"
                 )
+                logger.warning(e)
 
-        elif ipex_enabled:
+        elif IPEX_AVAILABLE:
             self.model = self.model.to(memory_format=torch.channels_last)
             self.model = ipex.optimize(self.model)
+            logger.info("Optimized model with IPEX")
 
         logger.debug("Model file %s loaded successfully", self.model_pt_path)
 
@@ -245,7 +252,10 @@ def _load_pickled_model(self, model_dir, model_file, model_pt_path):
         model_class = model_class_definitions[0]
         model = model_class()
         if model_pt_path:
-            state_dict = torch.load(model_pt_path, map_location=self.device)
+            map_location = (
+                None if (XLA_AVAILABLE and self.map_location is None) else self.device
+            )
+            state_dict = torch.load(model_pt_path, map_location=map_location)
             model.load_state_dict(state_dict)
         return model
 
diff --git a/ts/torch_handler/distributed/__init__.py b/ts/torch_handler/distributed/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/ts/torch_handler/distributed/base_deepspeed_handler.py b/ts/torch_handler/distributed/base_deepspeed_handler.py
new file mode 100644
index 0000000000..5e26139a2a
--- /dev/null
+++ b/ts/torch_handler/distributed/base_deepspeed_handler.py
@@ -0,0 +1,14 @@
+import os
+from abc import ABC
+
+from ts.context import Context
+from ts.torch_handler.base_handler import BaseHandler
+
+
+class BaseDeepSpeedHandler(BaseHandler, ABC):
+    """
+    Base default DeepSpeed handler.
+    """
+
+    def initialize(self, ctx: Context):
+        self.device = int(os.getenv("LOCAL_RANK", 0))
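Both of the new distributed base handlers key off the per-process rank environment that a launcher such as torchrun populates. For orientation, a rough sketch of that mapping (not itself part of this diff):

import os

import torch

local_rank = int(os.getenv("LOCAL_RANK", 0))  # worker index on this node
world_size = int(os.getenv("WORLD_SIZE", 1))  # total workers across all nodes
# BasePippyHandler (next file) pins each worker to a GPU round-robin by local rank:
device = local_rank % torch.cuda.device_count()

diff --git a/ts/torch_handler/distributed/base_pippy_handler.py b/ts/torch_handler/distributed/base_pippy_handler.py
new file mode 100644
index 0000000000..bb234a06c1
--- /dev/null
+++ b/ts/torch_handler/distributed/base_pippy_handler.py
@@ -0,0 +1,23 @@
+"""
+Base default handler to load large models using PyTorch Native PiPPy.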
+""" +import os +from abc import ABC + +import torch + +from ts.handler_utils.distributed.pt_pippy import initialize_rpc_workers +from ts.torch_handler.base_handler import BaseHandler + + +class BasePippyHandler(BaseHandler, ABC): + """ + Base default handler to set up rpc workers for PiPPy large model inference + """ + + def initialize(self, ctx): + self.local_rank = int(os.environ["LOCAL_RANK"]) + self.world_size = int(os.environ["WORLD_SIZE"]) + n_devs = torch.cuda.device_count() + self.device = self.local_rank % n_devs + initialize_rpc_workers(self.local_rank, self.world_size, ctx) diff --git a/ts/torch_handler/request_envelope/kservev2.py b/ts/torch_handler/request_envelope/kservev2.py index 33e573cfb9..5a88e9497d 100644 --- a/ts/torch_handler/request_envelope/kservev2.py +++ b/ts/torch_handler/request_envelope/kservev2.py @@ -4,7 +4,9 @@ """ import json import logging + import numpy as np + from .base import BaseEnvelope logger = logging.getLogger(__name__) @@ -87,7 +89,9 @@ def _batch_from_json(self, rows): Joins the instances of a batch of JSON objects """ logger.debug("Parse input data %s", rows) - body_list = [body_list.get("data") or body_list.get("body") for body_list in rows] + body_list = [ + body_list.get("data") or body_list.get("body") for body_list in rows + ] data_list = self._from_json(body_list) return data_list @@ -99,7 +103,15 @@ def _from_json(self, body_list): if isinstance(body_list[0], (bytes, bytearray)): body_list = [json.loads(body.decode()) for body in body_list] logger.debug("Bytes array is %s", body_list) - if "id" in body_list[0]: + + input_names = [] + for index, input in enumerate(body_list[0]["inputs"]): + if input["datatype"] == "BYTES": + body_list[0]["inputs"][index]["data"] = input["data"][0] + input_names.append(input["name"]) + setattr(self.context, "input_names", input_names) + logger.debug("Bytes array is %s", body_list) + if body_list[0].get("id") is not None: setattr(self.context, "input_request_id", body_list[0]["id"]) data_list = [inputs_list.get("inputs") for inputs_list in body_list][0] return data_list @@ -116,7 +128,7 @@ def format_output(self, data): "model_name": "bert", "model_version": "1", "outputs": [{ - "name": "predict", + "name": "input-0", "shape": [1], "datatype": "INT64", "data": [2] @@ -131,10 +143,10 @@ def format_output(self, data): delattr(self.context, "input_request_id") else: response["id"] = self.context.get_request_id(0) - response["model_name"] = self.context.manifest.get("model").get( - "modelName") + response["model_name"] = self.context.manifest.get("model").get("modelName") response["model_version"] = self.context.manifest.get("model").get( - "modelVersion") + "modelVersion" + ) response["outputs"] = self._batch_to_json(data) return [response] @@ -143,18 +155,19 @@ def _batch_to_json(self, data): Splits batch output to json objects """ output = [] - for item in data: - output.append(self._to_json(item)) + input_names = getattr(self.context, "input_names") + delattr(self.context, "input_names") + for index, item in enumerate(data): + output.append(self._to_json(item, input_names[index])) return output - def _to_json(self, data): + def _to_json(self, data, input_name): """ Constructs JSON object from data """ output_data = {} data_ndarray = np.array(data) - output_data["name"] = ("explain" if self.context.get_request_header( - 0, "explain") == "True" else "predict") + output_data["name"] = input_name output_data["shape"] = list(data_ndarray.shape) output_data["datatype"] = _to_datatype(data_ndarray.dtype) 
output_data["data"] = data_ndarray.flatten().tolist() diff --git a/ts/torch_handler/unit_tests/conftest.py b/ts/torch_handler/unit_tests/conftest.py new file mode 100644 index 0000000000..a706adc53a --- /dev/null +++ b/ts/torch_handler/unit_tests/conftest.py @@ -0,0 +1,34 @@ +import shutil +import sys +from pathlib import Path + +import pytest + +from .models.base_model import save_pt_file +from .test_utils.mock_context import MockContext + + +@pytest.fixture() +def base_model_dir(tmp_path_factory): + model_dir = tmp_path_factory.mktemp("base_model_dir") + + shutil.copyfile( + Path(__file__).parents[0] / "models" / "base_model.py", model_dir / "model.py" + ) + + save_pt_file(model_dir.joinpath("model.pt").as_posix()) + + sys.path.append(model_dir.as_posix()) + yield model_dir + sys.path.pop() + + +@pytest.fixture() +def base_model_context(base_model_dir): + + context = MockContext( + model_name="mnist", + model_dir=base_model_dir.as_posix(), + model_file="model.py", + ) + yield context diff --git a/ts/torch_handler/unit_tests/models/base_model.py b/ts/torch_handler/unit_tests/models/base_model.py index 95c9d67a2f..80da5a7570 100644 --- a/ts/torch_handler/unit_tests/models/base_model.py +++ b/ts/torch_handler/unit_tests/models/base_model.py @@ -6,10 +6,16 @@ import torch + class ArgmaxModel(torch.nn.Module): def forward(self, *input): return torch.argmax(input[0], 1) -if __name__ == '__main__': + +def save_pt_file(filepath="base_model.pt"): model = ArgmaxModel() - torch.save(model.state_dict(), 'base_model.pt') + torch.save(model.state_dict(), filepath) + + +if __name__ == "__main__": + save_pt_file() diff --git a/ts/torch_handler/unit_tests/run_unit_tests.sh b/ts/torch_handler/unit_tests/run_unit_tests.sh deleted file mode 100755 index e2d339dca7..0000000000 --- a/ts/torch_handler/unit_tests/run_unit_tests.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash - -TEST_DIR=./ts/torch_handler/unit_tests -case $PWD/ in - *ts/torch_handler/unit_tests/) echo "Running tests";; - *) echo "Error! 
Must start in unit_tests directory"; exit 1;; -esac - -cd ../../../ - -test_image_classifier () { - mkdir -p $TEST_DIR/models/tmp - wget -nc -q -O \ - $TEST_DIR/models/tmp/model.pt \ - https://download.pytorch.org/models/resnet152-b121ed2d.pth - - cp -r examples/image_classifier/resnet_152_batch/* $TEST_DIR/models/tmp - python -m pytest $TEST_DIR/test_image_classifier.py - rm -rf $TEST_DIR/models/tmp -} - -test_mnist_classifier () { - mkdir -p $TEST_DIR/models/tmp - - cp -r examples/image_classifier/mnist/* $TEST_DIR/models/tmp - python -m pytest $TEST_DIR/test_mnist_kf.py - rm -rf $TEST_DIR/models/tmp -} - - - -test_image_segmenter () { - mkdir -p $TEST_DIR/models/tmp - wget -nc -q -O \ - $TEST_DIR/models/tmp/model.pt \ - https://download.pytorch.org/models/fcn_resnet101_coco-7ecb50ca.pth - cp -r examples/image_segmenter/fcn/* $TEST_DIR/models/tmp - python -m pytest $TEST_DIR/test_image_segmenter.py - rm -rf $TEST_DIR/models/tmp -} - -test_base_handler () { - mkdir -p $TEST_DIR/models/tmp - python $TEST_DIR/models/base_model.py - mv base_model.pt $TEST_DIR/models/tmp/model.pt - cp $TEST_DIR/models/base_model.py $TEST_DIR/models/tmp/model.py - python -m pytest $TEST_DIR/test_base_handler.py - python -m pytest $TEST_DIR/test_envelopes.py - rm -rf $TEST_DIR/models/tmp -} - -test_envelope () { - mkdir -p $TEST_DIR/models/tmp - python $TEST_DIR/models/base_model.py - mv base_model.pt $TEST_DIR/models/tmp/model.pt - cp $TEST_DIR/models/base_model.py $TEST_DIR/models/tmp/model.py - python -m pytest $TEST_DIR/test_envelopes.py - rm -rf $TEST_DIR/models/tmp -} - -test_object_detector () { - mkdir -p $TEST_DIR/models/tmp - wget -nc -q -O \ - $TEST_DIR/models/tmp/model.pt \ - https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth - cp -r examples/object_detector/fast-rcnn/* $TEST_DIR/models/tmp - python -m pytest $TEST_DIR/test_object_detector.py - rm -rf $TEST_DIR/models/tmp -} - -test_base_handler -test_envelope -test_image_classifier -test_image_segmenter -test_object_detector -test_mnist_classifier diff --git a/ts/torch_handler/unit_tests/test_base_handler.py b/ts/torch_handler/unit_tests/test_base_handler.py index 08dc222831..913864a229 100644 --- a/ts/torch_handler/unit_tests/test_base_handler.py +++ b/ts/torch_handler/unit_tests/test_base_handler.py @@ -5,34 +5,28 @@ Ensures it can load and execute an example model """ -import sys import pytest + from ts.torch_handler.base_handler import BaseHandler -from .test_utils.mock_context import MockContext -sys.path.append('ts/torch_handler/unit_tests/models/tmp') @pytest.fixture() -def model_context(): - return MockContext() - -def test_initialize(model_context): +def handler(base_model_context): handler = BaseHandler() - handler.initialize(model_context) + handler.initialize(base_model_context) - assert(True) return handler -def test_single_handle(model_context): - handler = test_initialize(model_context) + +def test_single_handle(handler, base_model_context): list_data = [[1.0, 2.0]] - processed = handler.handle(list_data, model_context) + processed = handler.handle(list_data, base_model_context) + + assert processed == [1] - assert(processed == [1]) -def test_batch_handle(model_context): - handler = test_initialize(model_context) +def test_batch_handle(handler, base_model_context): list_data = [[1.0, 2.0], [4.0, 3.0]] - processed = handler.handle(list_data, model_context) + processed = handler.handle(list_data, base_model_context) - assert(processed == [1, 0]) + assert processed == [1, 0] diff --git 
a/ts/torch_handler/unit_tests/test_envelopes.py b/ts/torch_handler/unit_tests/test_envelopes.py index 6094480dff..ca870f398f 100644 --- a/ts/torch_handler/unit_tests/test_envelopes.py +++ b/ts/torch_handler/unit_tests/test_envelopes.py @@ -5,79 +5,68 @@ Ensures it can load and execute an example model """ -import sys import pytest + from ts.torch_handler.base_handler import BaseHandler from ts.torch_handler.request_envelope.body import BodyEnvelope from ts.torch_handler.request_envelope.json import JSONEnvelope -from .test_utils.mock_context import MockContext - -sys.path.append('ts/torch_handler/unit_tests/models/tmp') -@pytest.fixture() -def model_context(): - return MockContext() @pytest.fixture() -def handle_fn(): - ctx = MockContext() +def handle_fn(base_model_context): handler = BaseHandler() - handler.initialize(ctx) + handler.initialize(base_model_context) return handler.handle -def test_json(handle_fn, model_context): - test_data = [{'body':{ - 'instances': [[1.0, 2.0]] - }}] + +def test_json(handle_fn, base_model_context): + test_data = [{"body": {"instances": [[1.0, 2.0]]}}] expected_result = ['{"predictions": [1]}'] envelope = JSONEnvelope(handle_fn) - results = envelope.handle(test_data, model_context) - assert(results == expected_result) + results = envelope.handle(test_data, base_model_context) + assert results == expected_result + -def test_json_batch(handle_fn, model_context): - test_data = [{'body':{ - 'instances': [[1.0, 2.0], [4.0, 3.0]] - }}] +def test_json_batch(handle_fn, base_model_context): + test_data = [{"body": {"instances": [[1.0, 2.0], [4.0, 3.0]]}}] expected_result = ['{"predictions": [1, 0]}'] envelope = JSONEnvelope(handle_fn) - results = envelope.handle(test_data, model_context) - assert(results == expected_result) + results = envelope.handle(test_data, base_model_context) + assert results == expected_result -def test_json_double_batch(handle_fn, model_context): + +def test_json_double_batch(handle_fn, base_model_context): """ More complex test case. 
Makes sure the model can mux several batches and return the demuxed results """ test_data = [ - {'body':{'instances': [[1.0, 2.0]]}}, - {'body':{'instances': [[4.0, 3.0], [5.0, 6.0]]}} - + {"body": {"instances": [[1.0, 2.0]]}}, + {"body": {"instances": [[4.0, 3.0], [5.0, 6.0]]}}, ] expected_result = ['{"predictions": [1]}', '{"predictions": [0, 1]}'] envelope = JSONEnvelope(handle_fn) - results = envelope.handle(test_data, model_context) + results = envelope.handle(test_data, base_model_context) print(results) - assert(results == expected_result) + assert results == expected_result -def test_body(handle_fn, model_context): - test_data = [{ - 'body':[1.0, 2.0] - }] + +def test_body(handle_fn, base_model_context): + test_data = [{"body": [1.0, 2.0]}] expected_result = [1] envelope = BodyEnvelope(handle_fn) - results = envelope.handle(test_data, model_context) - assert(results == expected_result) + results = envelope.handle(test_data, base_model_context) + assert results == expected_result + -def test_binary(model_context): - test_data = [{ - 'instances': [{'b64': 'YQ=='}] - }] +def test_binary(base_model_context): + test_data = [{"instances": [{"b64": "YQ=="}]}] - envelope = JSONEnvelope(lambda x, y: [row.decode('utf-8') for row in x]) - results = envelope.handle(test_data, model_context) - assert(results == ['{"predictions": ["a"]}']) + envelope = JSONEnvelope(lambda x, y: [row.decode("utf-8") for row in x]) + results = envelope.handle(test_data, base_model_context) + assert results == ['{"predictions": ["a"]}'] diff --git a/ts/torch_handler/unit_tests/test_image_classifier.py b/ts/torch_handler/unit_tests/test_image_classifier.py index b03a73872f..73d1fe9784 100644 --- a/ts/torch_handler/unit_tests/test_image_classifier.py +++ b/ts/torch_handler/unit_tests/test_image_classifier.py @@ -4,42 +4,87 @@ Basic unit test for ImageClassifier class. 
Ensures it can load and execute an example model """ - import sys +from pathlib import Path + import pytest +from torchvision.models.resnet import ResNet18_Weights + from ts.torch_handler.image_classifier import ImageClassifier + from .test_utils.mock_context import MockContext +from .test_utils.model_dir import copy_files, download_model + +REPO_DIR = Path(__file__).parents[3] -sys.path.append('ts/torch_handler/unit_tests/models/tmp') -@pytest.fixture() -def model_setup(): - context = MockContext(model_name="mnist") - with open('ts/torch_handler/unit_tests/models/tmp/images/kitten.jpg', 'rb') as fin: +@pytest.fixture(scope="module") +def image_bytes(): + with open( + REPO_DIR.joinpath( + "examples/image_classifier/resnet_152_batch/images/kitten.jpg" + ).as_posix(), + "rb", + ) as fin: image_bytes = fin.read() - return (context, image_bytes) + yield image_bytes -def test_initialize(model_setup): - model_context, _ = model_setup - handler = ImageClassifier() - handler.initialize(model_context) - assert(True) +@pytest.fixture(scope="module") +def model_name(): + return "image_classifier" + + +@pytest.fixture(scope="module") +def model_dir(tmp_path_factory, model_name): + model_dir = tmp_path_factory.mktemp("image_classifier_model_dir") + + src_dir = REPO_DIR.joinpath("examples/image_classifier/resnet_18/") + + model_url = ResNet18_Weights.DEFAULT.url + + download_model(model_url, model_dir) + + files = { + "model.py": model_name + ".py", + "index_to_name.json": "index_to_name.json", + } + + copy_files(src_dir, model_dir, files) + + sys.path.append(model_dir.as_posix()) + yield model_dir + sys.path.pop() + + +@pytest.fixture(scope="module") +def context(model_dir, model_name): + + context = MockContext( + model_name="mnist", + model_dir=model_dir.as_posix(), + model_file=model_name + ".py", + ) + yield context + + +@pytest.fixture(scope="module") +def handler(context): + handler = ImageClassifier() + handler.initialize(context) return handler -def test_handle(model_setup): - context, image_bytes = model_setup - handler = test_initialize(model_setup) - test_data = [{'data': image_bytes}] * 2 + +def test_handle(context, image_bytes, handler): + test_data = [{"data": image_bytes}] * 2 results = handler.handle(test_data, context) - assert(len(results) == 2) - assert('tiger_cat' in results[0]) + assert len(results) == 2 + assert "tiger_cat" in results[0] + -def test_handle_explain(model_setup): - context, image_bytes = model_setup +def test_handle_explain(context, image_bytes, handler): context.explain = True - handler = test_initialize(model_setup) - test_data = [{'data': image_bytes, 'target': 0}] * 2 + test_data = [{"data": image_bytes, "target": 0}] * 2 results = handler.handle(test_data, context) - assert(len(results) == 2) - assert(results[0]) + assert len(results) == 2 + assert results[0] diff --git a/ts/torch_handler/unit_tests/test_image_segmenter.py b/ts/torch_handler/unit_tests/test_image_segmenter.py index 3d894fdac6..0a4923a208 100644 --- a/ts/torch_handler/unit_tests/test_image_segmenter.py +++ b/ts/torch_handler/unit_tests/test_image_segmenter.py @@ -6,31 +6,75 @@ """ import sys +from pathlib import Path + import pytest + from ts.torch_handler.image_segmenter import ImageSegmenter + from .test_utils.mock_context import MockContext +from .test_utils.model_dir import copy_files, download_model + +REPO_DIR = Path(__file__).parents[3] -sys.path.append('ts/torch_handler/unit_tests/models/tmp') @pytest.fixture() -def model_setup(): - context = MockContext(model_name="image_segmenter") - 
with open('ts/torch_handler/unit_tests/models/tmp/persons.jpg', 'rb') as fin: +def image_bytes(): + with open(REPO_DIR.joinpath("examples/image_segmenter/persons.jpg"), "rb") as fin: image_bytes = fin.read() - return (context, image_bytes) + return image_bytes + + +@pytest.fixture() +def model_name(): + return "image_segmenter" + + +@pytest.fixture() +def model_dir(tmp_path_factory, model_name): + model_dir = tmp_path_factory.mktemp("image_segmenter_model_dir") + + src_dir = REPO_DIR.joinpath("examples/image_segmenter/fcn/") + + model_url = "https://download.pytorch.org/models/fcn_resnet101_coco-7ecb50ca.pth" -def test_initialize(model_setup): - model_context, _ = model_setup + download_model(model_url, model_dir) + + files = { + "model.py": model_name + ".py", + "intermediate_layer_getter.py": "intermediate_layer_getter.py", + "fcn.py": "fcn.py", + } + + copy_files(src_dir, model_dir, files) + + sys.path.append(model_dir.as_posix()) + yield model_dir + sys.path.pop() + + +@pytest.fixture() +def context(model_dir, model_name): + + context = MockContext( + model_name="mnist", + model_dir=model_dir.as_posix(), + model_file=model_name + ".py", + ) + yield context + + +@pytest.fixture() +def handler(context): handler = ImageSegmenter() - handler.initialize(model_context) + handler.initialize(context) - assert(True) return handler -def test_handle(model_setup): - context, image_bytes = model_setup - handler = test_initialize(model_setup) - test_data = [{'data': image_bytes}] * 2 + +def test_handle(handler, context, image_bytes): + test_data = [{"data": image_bytes}] * 2 results = handler.handle(test_data, context) - assert(len(results) == 2) - assert(len(results[0]) == 224) + + assert len(results) == 2 + assert len(results[0]) == 224 diff --git a/ts/torch_handler/unit_tests/test_mnist_kf.py b/ts/torch_handler/unit_tests/test_mnist_kf.py index 01096bfa39..717a18ce45 100644 --- a/ts/torch_handler/unit_tests/test_mnist_kf.py +++ b/ts/torch_handler/unit_tests/test_mnist_kf.py @@ -6,23 +6,22 @@ """ import io -import os +import shutil import sys +from pathlib import Path import pytest import torchvision.transforms as transforms from PIL import Image -from ts.torch_handler.request_envelope import kserve, kservev2 - -sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../", "examples")) from examples.image_classifier.mnist.mnist_handler import ( MNISTDigitClassifier as MNISTClassifier, ) +from ts.torch_handler.request_envelope import kserve, kservev2 from .test_utils.mock_context import MockContext -sys.path.append("ts/torch_handler/unit_tests/models/tmp") +REPO_DIR = Path(__file__).parents[3] image_processing = transforms.Compose( @@ -30,66 +29,81 @@ ) -@pytest.fixture() -def model_setup(): - context = MockContext(model_pt_file="mnist_cnn.pt", model_file="mnist.py") - with open("ts/torch_handler/unit_tests/models/tmp/test_data/0.png", "rb") as fin: +@pytest.fixture(scope="module") +def image_bytes(): + + with open( + REPO_DIR.joinpath("examples/image_classifier/mnist/test_data/0.png"), "rb" + ) as fin: image_bytes = fin.read() - return (context, image_bytes) + return image_bytes + + +@pytest.fixture(scope="module") +def context(tmp_path_factory): + model_dir = tmp_path_factory.mktemp("model_dir") + + shutil.copytree( + REPO_DIR.joinpath("examples/image_classifier/mnist/"), + model_dir, + dirs_exist_ok=True, + ) + context = MockContext( + model_name="mnist", + model_dir=model_dir.as_posix(), + model_file="mnist.py", + model_pt_file="mnist_cnn.pt", + ) -def test_initialize(model_setup): - 
model_context, _ = model_setup + sys.path.append(model_dir.as_posix()) + yield context + sys.path.pop() + + +@pytest.fixture(scope="module") +def handler(context): handler = MNISTClassifier() - handler.initialize(model_context) - assert True + handler.initialize(context) return handler -def test_handle(model_setup): - context, bytes_array = model_setup - handler = test_initialize(model_setup) - test_data = [{"data": bytes_array}] - # testing for predict API - results = handler.handle(test_data, context) - assert results[0] in range(0, 9) +@pytest.fixture() +def envelope_kf(context): + handler = MNISTClassifier() + handler.initialize(context) + envelope = kserve.KServeEnvelope(handler.handle) + return envelope -def test_initialize_kf(model_setup): - model_context, _ = model_setup +@pytest.fixture() +def envelope_kfv2(context): handler = MNISTClassifier() - handler.initialize(model_context) - envelope = kserve.KServeEnvelope(handler.handle) - assert True + handler.initialize(context) + envelope = kservev2.KServev2Envelope(handler.handle) return envelope -def test_handle_kf(model_setup): - context, bytes_array = model_setup - image = Image.open(io.BytesIO(bytes_array)) +def test_handle(handler, context, image_bytes): + test_data = [{"data": image_bytes}] + # testing for predict API + results = handler.handle(test_data, context) + assert results[0] in range(0, 9) + + +def test_handle_kf(envelope_kf, context, image_bytes): + image = Image.open(io.BytesIO(image_bytes)) image_list = image_processing(image).tolist() - envelope = test_initialize_kf(model_setup) test_data = {"body": {"instances": [{"data": image_list}]}} # testing for predict API - results = envelope.handle([test_data], context) + results = envelope_kf.handle([test_data], context) assert results[0]["predictions"][0] in range(0, 9) -def test_initialize_kfv2(model_setup): - model_context, _ = model_setup - handler = MNISTClassifier() - handler.initialize(model_context) - envelope = kservev2.KServev2Envelope(handler.handle) - assert True - return envelope - - -def test_handle_kfv2(model_setup): - context, bytes_array = model_setup - image = Image.open(io.BytesIO(bytes_array)) +def test_handle_kfv2(envelope_kfv2, context, image_bytes): + image = Image.open(io.BytesIO(image_bytes)) image_list = image_processing(image).tolist() - envelope = test_initialize_kfv2(model_setup) test_data = { "body": { "id": "test-id", @@ -105,6 +119,6 @@ def test_handle_kfv2(model_setup): } # testing for v2predict API - results = envelope.handle([test_data], context) + results = envelope_kfv2.handle([test_data], context) print(results) assert results[0]["outputs"][0]["data"][0] in range(0, 9) diff --git a/ts/torch_handler/unit_tests/test_object_detector.py b/ts/torch_handler/unit_tests/test_object_detector.py index 2b7bf8d47d..290643f5ab 100644 --- a/ts/torch_handler/unit_tests/test_object_detector.py +++ b/ts/torch_handler/unit_tests/test_object_detector.py @@ -6,31 +6,75 @@ """ import sys +from pathlib import Path + import pytest + from ts.torch_handler.object_detector import ObjectDetector + from .test_utils.mock_context import MockContext +from .test_utils.model_dir import copy_files, download_model -sys.path.append('ts/torch_handler/unit_tests/models/tmp') +REPO_DIR = Path(__file__).parents[3] -@pytest.fixture() -def model_setup(): - context = MockContext(model_name="object_detector") - with open('ts/torch_handler/unit_tests/models/tmp/persons.jpg', 'rb') as fin: + +@pytest.fixture(scope="module") +def image_bytes(): + with 
open(REPO_DIR.joinpath("examples/image_segmenter/persons.jpg"), "rb") as fin: image_bytes = fin.read() - return (context, image_bytes) + return image_bytes + + +@pytest.fixture(scope="module") +def model_name(): + return "object_detector" + + +@pytest.fixture(scope="module") +def model_dir(tmp_path_factory, model_name): + model_dir = tmp_path_factory.mktemp("model_dir") + + src_dir = REPO_DIR.joinpath("examples/object_detector/fast-rcnn/") + + model_url = ( + "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" + ) -def test_initialize(model_setup): - model_context, _ = model_setup + download_model(model_url, model_dir) + + files = { + "model.py": model_name + ".py", + "index_to_name.json": "index_to_name.json", + } + + copy_files(src_dir, model_dir, files) + + sys.path.append(model_dir.as_posix()) + yield model_dir + sys.path.pop() + + +@pytest.fixture(scope="module") +def context(model_dir, model_name): + + context = MockContext( + model_name="mnist", + model_dir=model_dir.as_posix(), + model_file=model_name + ".py", + ) + yield context + + +@pytest.fixture(scope="module") +def handler(context): handler = ObjectDetector() - handler.initialize(model_context) + handler.initialize(context) - assert(True) return handler -def test_handle(model_setup): - context, image_bytes = model_setup - handler = test_initialize(model_setup) - test_data = [{'data': image_bytes}] * 2 + +def test_handle(handler, context, image_bytes): + test_data = [{"data": image_bytes}] * 2 results = handler.handle(test_data, context) - assert(len(results) == 2) - assert(any('bench' in d for d in results[0])) + assert len(results) == 2 + assert any("bench" in d for d in results[0]) diff --git a/ts/torch_handler/unit_tests/test_utils/model_dir.py b/ts/torch_handler/unit_tests/test_utils/model_dir.py new file mode 100644 index 0000000000..aa4db321e8 --- /dev/null +++ b/ts/torch_handler/unit_tests/test_utils/model_dir.py @@ -0,0 +1,30 @@ +import shutil +from pathlib import Path +from typing import Dict +from urllib import request +from urllib.parse import urlparse + +REPO_DIR = Path(__file__).parents[4] + + +def download_model(model_url: str, model_dir: Path) -> None: + cache_dir = REPO_DIR / ".cache" + + if not cache_dir.exists(): + cache_dir.mkdir() + + parts = urlparse(model_url) + + filename = Path(parts.path).name + + if not cache_dir.joinpath(filename).exists(): + with request.urlopen(model_url) as fin: + with open(cache_dir / filename, "wb") as fout: + fout.write(fin.read()) + + shutil.copy(cache_dir / filename, model_dir / "model.pt") + + +def copy_files(src_dir: Path, dst_dir: Path, files: Dict) -> None: + for src_file, dst_file in files.items(): + shutil.copyfile(src_dir / src_file, dst_dir / dst_file) diff --git a/ts/torch_handler/vision_handler.py b/ts/torch_handler/vision_handler.py index 4a8dcffecd..0ad08af327 100644 --- a/ts/torch_handler/vision_handler.py +++ b/ts/torch_handler/vision_handler.py @@ -3,12 +3,14 @@ """ Base module for all vision handlers """ -from abc import ABC -import io import base64 +import io +from abc import ABC + import torch -from PIL import Image from captum.attr import IntegratedGradients +from PIL import Image + from .base_handler import BaseHandler @@ -16,6 +18,7 @@ class VisionHandler(BaseHandler, ABC): """ Base class for all vision handlers """ + def initialize(self, context): super().initialize(context) self.ig = IntegratedGradients(self.model) diff --git a/ts/utils/__init__.py b/ts/utils/__init__.py index fb83331a7e..01d3db1a0d 100644 --- 
a/ts/utils/__init__.py
+++ b/ts/utils/__init__.py
@@ -1,7 +1,3 @@
-
-
 """
 Util files for TorchServe
 """
-
-from . import timeit_decorator
diff --git a/ts/utils/util.py b/ts/utils/util.py
index 629f274008..b636beb8c4 100644
--- a/ts/utils/util.py
+++ b/ts/utils/util.py
@@ -9,6 +9,8 @@
 import os
 import re
 
+import yaml
+
 
 class PT2Backend(str, enum.Enum):
     EAGER = "eager"
@@ -21,6 +23,7 @@ class PT2Backend(str, enum.Enum):
     FX2TRT = "fx2trt"
     ONNXRT = "onnxrt"
     IPEX = "ipex"
+    TORCHXLA_TRACE_ONCE = "torchxla_trace_once"
 
 
 logger = logging.getLogger(__name__)
@@ -53,24 +56,14 @@ def list_classes_from_module(module, parent_class=None):
     return classes
 
 
-def load_compiler_config(config_file_path):
-    """
-    Load a compiler {compiler_name -> compiler }
-    Can be extended to also support kwargs for ONNX and TensorRT
-    """
-    if not os.path.isfile(config_file_path):
-        logger.info(f"{config_file_path} is missing. PT 2.0 will not be used")
-        return None
-
-    with open(config_file_path) as f:
-        mapping = json.load(f)
-
+def check_valid_pt2_backend(backend: str) -> bool:
+    """Check whether the given name is one of the supported PT2Backend values."""
     backend_values = [member.value for member in PT2Backend]
-    if mapping["pt2"] in backend_values:
-        return mapping["pt2"]
+    if backend in backend_values:
+        return True
     else:
-        logger.warning(f"{mapping['pt2']} is not a supported backend")
-        return None
+        logger.warning(f"{backend} is not a supported backend")
+        return False
 
 
 def load_label_mapping(mapping_file_path):
@@ -136,6 +128,13 @@ def map_class_to_label(probs, mapping=None, lbl_classes=None):
     return results
 
 
+def get_yaml_config(yaml_file_path):
+    """Load a YAML config file into a dict; an empty file yields an empty dict."""
+    with open(yaml_file_path, "r") as file:
+        config = yaml.safe_load(file) or {}
+    return config
+
+
 class PredictionException(Exception):
     def __init__(self, message, error_code=500):
         self.message = message
diff --git a/ts/version.txt b/ts/version.txt
index 39e898a4f9..a3df0a6959 100644
--- a/ts/version.txt
+++ b/ts/version.txt
@@ -1 +1 @@
-0.7.1
+0.8.0
diff --git a/ts_scripts/api_utils.py b/ts_scripts/api_utils.py
index a19c9f1799..99398ef17c 100755
--- a/ts_scripts/api_utils.py
+++ b/ts_scripts/api_utils.py
@@ -16,13 +16,12 @@
 ### Torchserve
 ARTIFACTS_MANAGEMENT_DIR = os.path.join("artifacts", "management")
 ARTIFACTS_INFERENCE_DIR = os.path.join("artifacts", "inference")
-ARTIFACTS_WORKFLOW_MANAGEMENT_DIR = os.path.join("artifacts",
-                                                 "workflow_management")
-ARTIFACTS_WORKFLOW_INFERENCE_DIR = os.path.join("artifacts",
-                                                "workflow_inference")
+ARTIFACTS_WORKFLOW_MANAGEMENT_DIR = os.path.join("artifacts", "workflow_management")
+ARTIFACTS_WORKFLOW_INFERENCE_DIR = os.path.join("artifacts", "workflow_inference")
 ARTIFACTS_EXPLANATION_DIR = os.path.join("artifacts", "explanation")
 ARTIFACTS_INCRSD_TIMEOUT_INFERENCE_DIR = os.path.join(
-    "artifacts", "increased_timeout_inference")
+    "artifacts", "increased_timeout_inference"
+)
 ARTIFACTS_HTTPS_DIR = os.path.join("artifacts", "https")
 
 TS_CONSOLE_LOG_FILE = os.path.join("ts_console.log")
@@ -30,25 +29,35 @@
 POSTMAN_ENV_FILE = os.path.join("postman", "environment.json")
 POSTMAN_INFERENCE_DATA_FILE = os.path.join("postman", "inference_data.json")
+POSTMAN_LARGE_MODEL_INFERENCE_DATA_FILE = os.path.join(
+    "postman", "large_model_inference_data.json"
+)
 POSTMAN_EXPLANATION_DATA_FILE = os.path.join("postman", "explanation_data.json")
 POSTMAN_MANAGEMENT_DATA_FILE = os.path.join("postman", "management_data.json")
 POSTMAN_WORKFLOW_DATA_FILE = os.path.join("postman", "workflow_data.json")
 POSTMAN_WORKFLOW_INFERENCE_DATA_FILE = os.path.join(
-    "postman", "workflow_inference_data.json")
+    "postman",
"workflow_inference_data.json" +) POSTMAN_INCRSD_TIMEOUT_INFERENCE_DATA_FILE = os.path.join( - "postman", "increased_timeout_inference.json") + "postman", "increased_timeout_inference.json" +) -#only one management collection for both KServe and torchserve +# only one management collection for both KServe and torchserve POSTMAN_COLLECTION_MANAGEMENT = os.path.join( - "postman", "management_api_test_collection.json") + "postman", "management_api_test_collection.json" +) POSTMAN_COLLECTION_INFERENCE = os.path.join( - "postman", "inference_api_test_collection.json") -POSTMAN_COLLECTION_WORKFLOW = os.path.join("postman", - "workflow_api_test_collection.json") + "postman", "inference_api_test_collection.json" +) +POSTMAN_COLLECTION_WORKFLOW = os.path.join( + "postman", "workflow_api_test_collection.json" +) POSTMAN_COLLECTION_WORKFLOW_INFERENCE = os.path.join( - "postman", "workflow_inference_collection.json") + "postman", "workflow_inference_collection.json" +) POSTMAN_COLLECTION_EXPLANATION = os.path.join( - "postman", "explanation_api_test_collection.json") + "postman", "explanation_api_test_collection.json" +) POSTMAN_COLLECTION_HTTPS = os.path.join("postman", "https_test_collection.json") @@ -56,38 +65,39 @@ ARTIFACTS_MANAGEMENT_DIR_KF = os.path.join("artifacts", "management_kf") ARTIFACTS_INFERENCE_DIR_KF = os.path.join("artifacts", "inference_kf") ARTIFACTS_INCRSD_TIMEOUT_INFERENCE_DIR_KF = os.path.join( - "artifacts", "increased_timeout_inference_kf") + "artifacts", "increased_timeout_inference_kf" +) ARTIFACTS_HTTPS_DIR_KF = os.path.join("artifacts", "https_kf") TS_CONFIG_FILE_HTTPS_KF = os.path.join("resources", "config_kf.properties") -POSTMAN_INFERENCE_DATA_FILE_KF = os.path.join("postman", - "kf_inference_data.json") +POSTMAN_INFERENCE_DATA_FILE_KF = os.path.join("postman", "kf_inference_data.json") POSTMAN_INCRSD_TIMEOUT_INFERENCE_DATA_FILE_KF = os.path.join( - "postman", "increased_timeout_inference.json") + "postman", "increased_timeout_inference.json" +) -POSTMAN_COLLECTION_INFERENCE_KF = os.path.join("postman", - "kf_api_test_collection.json") +POSTMAN_COLLECTION_INFERENCE_KF = os.path.join("postman", "kf_api_test_collection.json") -POSTMAN_COLLECTION_HTTPS_KF = os.path.join("postman", - "kf_https_test_collection.json") +POSTMAN_COLLECTION_HTTPS_KF = os.path.join("postman", "kf_https_test_collection.json") ### KServe v2 protocol ARTIFACTS_INFERENCE_DIR_KFV2 = os.path.join("artifacts", "inference_kfv2") ARTIFACTS_INCRSD_TIMEOUT_INFERENCE_DIR_KFV2 = os.path.join( - "artifacts", "increased_timeout_inference_kfv2") + "artifacts", "increased_timeout_inference_kfv2" +) ARTIFACTS_HTTPS_DIR_KFV2 = os.path.join("artifacts", "https_kfv2") TS_CONFIG_FILE_HTTPS_KFV2 = os.path.join("resources", "config_kfv2.properties") -POSTMAN_INFERENCE_DATA_FILE_KFV2 = os.path.join("postman", - "kfv2_inference_data.json") +POSTMAN_INFERENCE_DATA_FILE_KFV2 = os.path.join("postman", "kfv2_inference_data.json") POSTMAN_COLLECTION_INFERENCE_KFV2 = os.path.join( - "postman", "kfv2_api_test_collection.json") + "postman", "kfv2_api_test_collection.json" +) -POSTMAN_COLLECTION_HTTPS_KFV2 = os.path.join("postman", - "kfv2_https_test_collection.json") +POSTMAN_COLLECTION_HTTPS_KFV2 = os.path.join( + "postman", "kfv2_https_test_collection.json" +) REPORT_FILE = os.path.join("report.html") @@ -105,10 +115,10 @@ def move_logs(log_file, artifact_dir): def trigger_management_tests(): - """ Return exit code of newman execution of management collection """ - ts.start_torchserve(ncs=True, - model_store=MODEL_STORE_DIR, - 
log_file=TS_CONSOLE_LOG_FILE) + """Return exit code of newman execution of management collection""" + ts.start_torchserve( + ncs=True, model_store=MODEL_STORE_DIR, log_file=TS_CONSOLE_LOG_FILE + ) EXIT_CODE = os.system( f"newman run -e {POSTMAN_ENV_FILE} {POSTMAN_COLLECTION_MANAGEMENT} -d {POSTMAN_MANAGEMENT_DATA_FILE} -r cli,htmlextra --reporter-htmlextra-export {ARTIFACTS_MANAGEMENT_DIR}/{REPORT_FILE} --verbose" ) @@ -119,25 +129,35 @@ def trigger_management_tests(): def trigger_inference_tests(): - """ Return exit code of newman execution of inference collection """ - ts.start_torchserve(ncs=True, - model_store=MODEL_STORE_DIR, - log_file=TS_CONSOLE_LOG_FILE) + """Return exit code of newman execution of inference collection""" + config_file = open("config.properties", "w") + config_file.write("metrics_mode=prometheus") + config_file.close() + + ts.start_torchserve( + ncs=True, + model_store=MODEL_STORE_DIR, + config_file="config.properties", + log_file=TS_CONSOLE_LOG_FILE, + ) EXIT_CODE = os.system( f"newman run -e {POSTMAN_ENV_FILE} {POSTMAN_COLLECTION_INFERENCE} -d {POSTMAN_INFERENCE_DATA_FILE} -r cli,htmlextra --reporter-htmlextra-export {ARTIFACTS_INFERENCE_DIR}/{REPORT_FILE} --verbose" ) ts.stop_torchserve() move_logs(TS_CONSOLE_LOG_FILE, ARTIFACTS_INFERENCE_DIR) cleanup_model_store() + os.remove("config.properties") return EXIT_CODE def trigger_workflow_tests(): - """ Return exit code of newman execution of workflow collection """ - ts.start_torchserve(ncs=True, - model_store=MODEL_STORE_DIR, - workflow_store=MODEL_STORE_DIR, - log_file=TS_CONSOLE_LOG_FILE) + """Return exit code of newman execution of workflow collection""" + ts.start_torchserve( + ncs=True, + model_store=MODEL_STORE_DIR, + workflow_store=MODEL_STORE_DIR, + log_file=TS_CONSOLE_LOG_FILE, + ) EXIT_CODE = os.system( f"newman run -e {POSTMAN_ENV_FILE} {POSTMAN_COLLECTION_WORKFLOW} -d {POSTMAN_WORKFLOW_DATA_FILE} -r cli,htmlextra --reporter-htmlextra-export {ARTIFACTS_WORKFLOW_MANAGEMENT_DIR}/{REPORT_FILE} --verbose" ) @@ -148,11 +168,13 @@ def trigger_workflow_tests(): def trigger_workflow_inference_tests(): - """ Return exit code of newman execution of workflow inference collection """ - ts.start_torchserve(ncs=True, - model_store=MODEL_STORE_DIR, - workflow_store=MODEL_STORE_DIR, - log_file=TS_CONSOLE_LOG_FILE) + """Return exit code of newman execution of workflow inference collection""" + ts.start_torchserve( + ncs=True, + model_store=MODEL_STORE_DIR, + workflow_store=MODEL_STORE_DIR, + log_file=TS_CONSOLE_LOG_FILE, + ) EXIT_CODE = os.system( f"newman run -e {POSTMAN_ENV_FILE} {POSTMAN_COLLECTION_WORKFLOW_INFERENCE} -d {POSTMAN_WORKFLOW_INFERENCE_DATA_FILE} -r cli,htmlextra --reporter-htmlextra-export {ARTIFACTS_WORKFLOW_INFERENCE_DIR}/{REPORT_FILE} --verbose" ) @@ -163,32 +185,42 @@ def trigger_workflow_inference_tests(): def trigger_explanation_tests(): - """ Return exit code of newman execution of inference collection """ + """Return exit code of newman execution of inference collection""" + config_file = open("config.properties", "w") + config_file.write("metrics_mode=prometheus") + config_file.close() - ts.start_torchserve(ncs=True, - model_store=MODEL_STORE_DIR, - log_file=TS_CONSOLE_LOG_FILE) + ts.start_torchserve( + ncs=True, + model_store=MODEL_STORE_DIR, + config_file="config.properties", + log_file=TS_CONSOLE_LOG_FILE, + ) EXIT_CODE = os.system( f"newman run -e {POSTMAN_ENV_FILE} {POSTMAN_COLLECTION_EXPLANATION} -d {POSTMAN_EXPLANATION_DATA_FILE} -r cli,htmlextra --reporter-htmlextra-export 
@@ -163,32 +185,42 @@ def trigger_workflow_inference_tests():


 def trigger_explanation_tests():
-    """ Return exit code of newman execution of inference collection """
+    """Return exit code of newman execution of inference collection"""
+    config_file = open("config.properties", "w")
+    config_file.write("metrics_mode=prometheus")
+    config_file.close()

-    ts.start_torchserve(ncs=True,
-                        model_store=MODEL_STORE_DIR,
-                        log_file=TS_CONSOLE_LOG_FILE)
+    ts.start_torchserve(
+        ncs=True,
+        model_store=MODEL_STORE_DIR,
+        config_file="config.properties",
+        log_file=TS_CONSOLE_LOG_FILE,
+    )
     EXIT_CODE = os.system(
         f"newman run -e {POSTMAN_ENV_FILE} {POSTMAN_COLLECTION_EXPLANATION} -d {POSTMAN_EXPLANATION_DATA_FILE} -r cli,htmlextra --reporter-htmlextra-export {ARTIFACTS_INFERENCE_DIR}/{REPORT_FILE} --verbose"
     )
     ts.stop_torchserve()
     move_logs(TS_CONSOLE_LOG_FILE, ARTIFACTS_EXPLANATION_DIR)
     cleanup_model_store()
+    os.remove("config.properties")
     return EXIT_CODE


 def trigger_incr_timeout_inference_tests():
-    """ Return exit code of newman execution of increased timeout inference collection """
+    """Return exit code of newman execution of increased timeout inference collection"""
     # Configuration with increased timeout
     config_file = open("config.properties", "w")
-    config_file.write("default_response_timeout=300")
+    config_file.write("default_response_timeout=300\n")
+    config_file.write("metrics_mode=prometheus")
     config_file.close()

-    ts.start_torchserve(ncs=True,
-                        model_store=MODEL_STORE_DIR,
-                        config_file="config.properties",
-                        log_file=TS_CONSOLE_LOG_FILE)
+    ts.start_torchserve(
+        ncs=True,
+        model_store=MODEL_STORE_DIR,
+        config_file="config.properties",
+        log_file=TS_CONSOLE_LOG_FILE,
+    )
     EXIT_CODE = os.system(
         f"newman run -e {POSTMAN_ENV_FILE} {POSTMAN_COLLECTION_INFERENCE} -d {POSTMAN_INCRSD_TIMEOUT_INFERENCE_DATA_FILE} -r cli,htmlextra --reporter-htmlextra-export {ARTIFACTS_INCRSD_TIMEOUT_INFERENCE_DIR}/{REPORT_FILE} --verbose"
     )
@@ -201,11 +233,13 @@ def trigger_incr_timeout_inference_tests():


 def trigger_https_tests():
-    """ Return exit code of newman execution of https collection """
-    ts.start_torchserve(ncs=True,
-                        model_store=MODEL_STORE_DIR,
-                        config_file=TS_CONFIG_FILE_HTTPS,
-                        log_file=TS_CONSOLE_LOG_FILE)
+    """Return exit code of newman execution of https collection"""
+    ts.start_torchserve(
+        ncs=True,
+        model_store=MODEL_STORE_DIR,
+        config_file=TS_CONFIG_FILE_HTTPS,
+        log_file=TS_CONSOLE_LOG_FILE,
+    )
     EXIT_CODE = os.system(
         f"newman run --insecure -e {POSTMAN_ENV_FILE} {POSTMAN_COLLECTION_HTTPS} -r cli,htmlextra --reporter-htmlextra-export {ARTIFACTS_HTTPS_DIR}/{REPORT_FILE} --verbose"
     )
@@ -217,16 +251,18 @@ def trigger_https_tests():


 ## KServe tests starts here
 def trigger_management_tests_kf():
-    """ Return exit code of newman execution of management collection """
+    """Return exit code of newman execution of management collection"""
     config_file = open("config.properties", "w")
     config_file.write("service_envelope=kserve")
     config_file.close()

-    ts.start_torchserve(ncs=True,
-                        model_store=MODEL_STORE_DIR,
-                        config_file="config.properties",
-                        log_file=TS_CONSOLE_LOG_FILE)
+    ts.start_torchserve(
+        ncs=True,
+        model_store=MODEL_STORE_DIR,
+        config_file="config.properties",
+        log_file=TS_CONSOLE_LOG_FILE,
+    )
     EXIT_CODE = os.system(
         f"newman run -e {POSTMAN_ENV_FILE} {POSTMAN_COLLECTION_MANAGEMENT} -d {POSTMAN_MANAGEMENT_DATA_FILE} -r cli,htmlextra --reporter-htmlextra-export {ARTIFACTS_MANAGEMENT_DIR_KF}/{REPORT_FILE} --verbose"
     )
@@ -238,16 +274,19 @@ def trigger_management_tests_kf():


 def trigger_inference_tests_kf():
-    """ Return exit code of newman execution of inference collection """
+    """Return exit code of newman execution of inference collection"""
     config_file = open("config.properties", "w")
-    config_file.write("service_envelope=kserve")
+    config_file.write("service_envelope=kserve\n")
+    config_file.write("metrics_mode=prometheus")
     config_file.close()

-    ts.start_torchserve(ncs=True,
-                        model_store=MODEL_STORE_DIR,
-                        config_file="config.properties",
-                        log_file=TS_CONSOLE_LOG_FILE)
+    ts.start_torchserve(
+        ncs=True,
+        model_store=MODEL_STORE_DIR,
+        config_file="config.properties",
+        log_file=TS_CONSOLE_LOG_FILE,
+    )
     EXIT_CODE = os.system(
         f"newman run -e {POSTMAN_ENV_FILE} {POSTMAN_COLLECTION_INFERENCE_KF} -d {POSTMAN_INFERENCE_DATA_FILE_KF} -r cli,htmlextra --reporter-htmlextra-export {ARTIFACTS_INFERENCE_DIR_KF}/{REPORT_FILE} --verbose"
     )
@@ -259,11 +298,13 @@ def trigger_inference_tests_kf():


 def trigger_https_tests_kf():
-    """ Return exit code of newman execution of https collection """
-    ts.start_torchserve(ncs=True,
-                        model_store=MODEL_STORE_DIR,
-                        config_file=TS_CONFIG_FILE_HTTPS_KF,
-                        log_file=TS_CONSOLE_LOG_FILE)
+    """Return exit code of newman execution of https collection"""
+    ts.start_torchserve(
+        ncs=True,
+        model_store=MODEL_STORE_DIR,
+        config_file=TS_CONFIG_FILE_HTTPS_KF,
+        log_file=TS_CONSOLE_LOG_FILE,
+    )
     EXIT_CODE = os.system(
         f"newman run --insecure -e {POSTMAN_ENV_FILE} {POSTMAN_COLLECTION_HTTPS_KF} -r cli,htmlextra --reporter-htmlextra-export {ARTIFACTS_HTTPS_DIR_KF}/{REPORT_FILE} --verbose"
     )
@@ -274,16 +315,19 @@ def trigger_https_tests_kf():


 def trigger_inference_tests_kfv2():
-    """ Return exit code of newman execution of inference collection """
+    """Return exit code of newman execution of inference collection"""
     config_file = open("config.properties", "w")
-    config_file.write("service_envelope=kservev2")
+    config_file.write("service_envelope=kservev2\n")
+    config_file.write("metrics_mode=prometheus")
     config_file.close()

-    ts.start_torchserve(ncs=True,
-                        model_store=MODEL_STORE_DIR,
-                        config_file="config.properties",
-                        log_file=TS_CONSOLE_LOG_FILE)
+    ts.start_torchserve(
+        ncs=True,
+        model_store=MODEL_STORE_DIR,
+        config_file="config.properties",
+        log_file=TS_CONSOLE_LOG_FILE,
+    )
     EXIT_CODE = os.system(
         f"newman run -e {POSTMAN_ENV_FILE} {POSTMAN_COLLECTION_INFERENCE_KFV2} -d {POSTMAN_INFERENCE_DATA_FILE_KFV2} -r cli,htmlextra --reporter-htmlextra-export {ARTIFACTS_INFERENCE_DIR_KFV2}/{REPORT_FILE} --verbose"
     )
@@ -295,11 +339,13 @@ def trigger_inference_tests_kfv2():


 def trigger_https_tests_kfv2():
-    """ Return exit code of newman execution of https collection """
-    ts.start_torchserve(ncs=True,
-                        model_store=MODEL_STORE_DIR,
-                        config_file=TS_CONFIG_FILE_HTTPS_KFV2,
-                        log_file=TS_CONSOLE_LOG_FILE)
+    """Return exit code of newman execution of https collection"""
+    ts.start_torchserve(
+        ncs=True,
+        model_store=MODEL_STORE_DIR,
+        config_file=TS_CONFIG_FILE_HTTPS_KFV2,
+        log_file=TS_CONSOLE_LOG_FILE,
+    )
     EXIT_CODE = os.system(
         f"newman run --insecure -e {POSTMAN_ENV_FILE} {POSTMAN_COLLECTION_HTTPS_KFV2} -r cli,htmlextra --reporter-htmlextra-export {ARTIFACTS_HTTPS_DIR_KFV2}/{REPORT_FILE} --verbose"
     )
@@ -322,23 +368,47 @@ def trigger_all():
     exit_code10 = trigger_explanation_tests()
     exit_code11 = trigger_workflow_tests()
     exit_code12 = trigger_workflow_inference_tests()
-    return 1 if any(code != 0 for code in [
-        exit_code1, exit_code2, exit_code3, exit_code4, exit_code5, exit_code6,
-        exit_code7, exit_code8, exit_code9, exit_code10, exit_code11,
-        exit_code12
-    ]) else 0
+    return (
+        1
+        if any(
+            code != 0
+            for code in [
+                exit_code1,
+                exit_code2,
+                exit_code3,
+                exit_code4,
+                exit_code5,
+                exit_code6,
+                exit_code7,
+                exit_code8,
+                exit_code9,
+                exit_code10,
+                exit_code11,
+                exit_code12,
+            ]
+        )
+        else 0
+    )


 def test_api(collection):
     os.chdir(TEST_DIR)
     ALL_DIRS = [
-        MODEL_STORE_DIR, ARTIFACTS_MANAGEMENT_DIR, ARTIFACTS_INFERENCE_DIR,
-        ARTIFACTS_EXPLANATION_DIR, ARTIFACTS_INCRSD_TIMEOUT_INFERENCE_DIR,
-        ARTIFACTS_HTTPS_DIR, ARTIFACTS_MANAGEMENT_DIR_KF,
-        ARTIFACTS_INFERENCE_DIR_KF, ARTIFACTS_INCRSD_TIMEOUT_INFERENCE_DIR_KF,
-        ARTIFACTS_HTTPS_DIR_KF, ARTIFACTS_INFERENCE_DIR_KFV2,
-        ARTIFACTS_INCRSD_TIMEOUT_INFERENCE_DIR_KFV2, ARTIFACTS_HTTPS_DIR_KFV2,
-        ARTIFACTS_WORKFLOW_MANAGEMENT_DIR, ARTIFACTS_WORKFLOW_INFERENCE_DIR
+        MODEL_STORE_DIR,
+        ARTIFACTS_MANAGEMENT_DIR,
+        ARTIFACTS_INFERENCE_DIR,
+        ARTIFACTS_EXPLANATION_DIR,
+        ARTIFACTS_INCRSD_TIMEOUT_INFERENCE_DIR,
+        ARTIFACTS_HTTPS_DIR,
+        ARTIFACTS_MANAGEMENT_DIR_KF,
+        ARTIFACTS_INFERENCE_DIR_KF,
+        ARTIFACTS_INCRSD_TIMEOUT_INFERENCE_DIR_KF,
+        ARTIFACTS_HTTPS_DIR_KF,
+        ARTIFACTS_INFERENCE_DIR_KFV2,
+        ARTIFACTS_INCRSD_TIMEOUT_INFERENCE_DIR_KFV2,
+        ARTIFACTS_HTTPS_DIR_KFV2,
+        ARTIFACTS_WORKFLOW_MANAGEMENT_DIR,
+        ARTIFACTS_WORKFLOW_INFERENCE_DIR,
     ]

     for DIR in ALL_DIRS:
@@ -356,7 +426,7 @@ def test_api(collection):
         "https": trigger_https_tests,
         "https_kf": trigger_https_tests_kf,
         "https_kfv2": trigger_https_tests_kfv2,
-        "all": trigger_all
+        "all": trigger_all,
     }

     exit_code = switcher[collection]()
diff --git a/ts_scripts/backend_utils.py b/ts_scripts/backend_utils.py
index 756b4ec5fb..e5d0ea6c22 100755
--- a/ts_scripts/backend_utils.py
+++ b/ts_scripts/backend_utils.py
@@ -10,10 +10,11 @@ def test_torchserve():
     # Execute python tests
     print("## Started torchserve pytests")
     test_dir = os.path.join("ts", "tests", "unit_tests")
+    handler_test_dir = os.path.join("ts", "torch_handler", "unit_tests")
     coverage_dir = os.path.join("ts")
     report_output_dir = os.path.join(test_dir, "coverage.xml")

-    ts_test_cmd = f"python -m pytest --cov-report xml:{report_output_dir} --cov={coverage_dir} {test_dir}"
+    ts_test_cmd = f"python -m pytest --cov-report xml:{report_output_dir} --cov={coverage_dir} {test_dir} {handler_test_dir}"
     print(f"## In directory: {os.getcwd()} | Executing command: {ts_test_cmd}")
     ts_test_error_code = os.system(ts_test_cmd)
diff --git a/ts_scripts/configs/sanity_models.json b/ts_scripts/configs/sanity_models.json
new file mode 100644
index 0000000000..4b9e589fc8
--- /dev/null
+++ b/ts_scripts/configs/sanity_models.json
@@ -0,0 +1,73 @@
+[
+    {
+        "name": "fastrcnn",
+        "inputs": [
+            "examples/object_detector/persons.jpg"
+        ],
+        "handler": "object_detector"
+    },
+    {
+        "name": "fcn_resnet_101",
+        "inputs": [
+            "docs/images/blank_image.jpg",
+            "examples/image_segmenter/persons.jpg"
+        ],
+        "handler": "image_segmenter"
+    },
+    {
+        "name": "my_text_classifier_v4",
+        "inputs": [
+            "examples/text_classification/sample_text.txt"
+        ],
+        "handler": "text_classification"
+    },
+    {
+        "name": "resnet-18",
+        "inputs": [
+            "examples/image_classifier/kitten.jpg"
+        ],
+        "handler": "image_classifier"
+    },
+    {
+        "name": "my_text_classifier_scripted_v3",
+        "inputs": [
+            "examples/text_classification/sample_text.txt"
+        ],
+        "handler": "text_classification"
+    },
+    {
+        "name": "alexnet_scripted",
+        "inputs": [
+            "examples/image_classifier/kitten.jpg"
+        ],
+        "handler": "image_classifier"
+    },
+    {
+        "name": "fcn_resnet_101_scripted",
+        "inputs": [
+            "examples/image_segmenter/persons.jpg"
+        ],
+        "handler": "image_segmenter"
+    },
+    {
+        "name": "distill_bert_qa_eager",
+        "inputs": [
+            "examples/Huggingface_Transformers/QA_artifacts/sample_text.txt"
+        ],
+        "handler": "custom"
+    },
+    {
+        "name": "bert_token_classification_no_torchscript",
+        "inputs": [
+            "examples/Huggingface_Transformers/Token_classification_artifacts/sample_text.txt"
+        ],
+        "handler": "custom"
+    },
+    {
+        "name": "bert_seqc_without_torchscript",
+        "inputs": [
+            "examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
+        ],
+        "handler": "custom"
+    }
+]
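
The new sanity_models.json moves the sanity-model list out of code and into data; the loader added to sanity_utils.py later in this diff (load_model_to_validate) reads it back and asserts name uniqueness. A standalone sketch of that round trip, assuming it runs from the repo root:

import json
from pathlib import Path

# Hypothetical quick check for the new config file.
models = json.loads(Path("ts_scripts/configs/sanity_models.json").read_text())
names = [m["name"] for m in models]
assert len(set(names)) == len(names), "model names must be unique"
print(f"{len(models)} sanity models, handlers: {sorted({m['handler'] for m in models})}")
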
diff --git a/ts_scripts/install_dependencies.py b/ts_scripts/install_dependencies.py
index 86a7a9755b..b5b9b4b0a7 100644
--- a/ts_scripts/install_dependencies.py
+++ b/ts_scripts/install_dependencies.py
@@ -50,18 +50,22 @@ def install_python_packages(self, cuda_version, requirements_file_path, nightly)
             # as it may reinstall the packages with different versions
             os.system("conda install -y conda-build")

-        self.install_torch_packages(cuda_version)
+        # Install PyTorch packages
+        if nightly:
+            os.system(
+                f"pip3 install numpy --pre torch torchvision torchtext torchaudio --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/{cuda_version}"
+            )
+        else:
+            self.install_torch_packages(cuda_version)
+
         os.system(f"{sys.executable} -m pip install -U pip setuptools")

         # developer.txt also installs packages from common.txt
         os.system(f"{sys.executable} -m pip install -U -r {requirements_file_path}")

-        # If conda is available install conda-build package
-        # TODO: This will run 2 installations for torch but to make this cleaner we should first refactor all of our requirements.txt into just 2 files
-        # And then make torch an optional dependency for the common.txt
-        if nightly:
-            os.system(
-                f"pip3 install numpy --pre torch[dynamo] torchvision torchtext torchaudio --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/{cuda_version}"
-            )
+        # Install dependencies for GPU
+        if not isinstance(cuda_version, type(None)):
+            gpu_requirements_file = os.path.join("requirements", "common_gpu.txt")
+            os.system(f"{sys.executable} -m pip install -U -r {gpu_requirements_file}")
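
One side note on the GPU guard: `not isinstance(cuda_version, type(None))` is exactly equivalent to the more conventional `is not None` test, since `type(None)` is NoneType and only None is an instance of it:

# Equivalent, more idiomatic spelling of the guard added above.
def needs_gpu_requirements(cuda_version):
    return cuda_version is not None

assert needs_gpu_requirements("cu118")
assert not needs_gpu_requirements(None)
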
     def install_node_packages(self):
         os.system(
@@ -74,6 +78,9 @@ def install_jmeter(self):
     def install_wget(self):
         pass

+    def install_numactl(self):
+        pass
+

 class Linux(Common):
     def __init__(self):
@@ -99,13 +106,9 @@ def install_wget(self):
         if os.system("wget --version") != 0 or args.force:
             os.system(f"{self.sudo_cmd}apt-get install -y wget")

-    def install_libgit2(self):
-        os.system(
-            f"wget https://github.com/libgit2/libgit2/archive/refs/tags/v1.3.0.tar.gz -O libgit2-1.3.0.tar.gz"
-        )
-        os.system(f"tar xzf libgit2-1.3.0.tar.gz")
-        os.system(f"cd libgit2-1.3.0 && cmake . && make && sudo make install && cd ..")
-        os.system(f"rm -rf libgit2-1.3.0 && rm libgit2-1.3.0.tar.gz")
+    def install_numactl(self):
+        if os.system("numactl --show") != 0 or args.force:
+            os.system(f"{self.sudo_cmd}apt-get install -y numactl")


 class Windows(Common):
@@ -122,6 +125,9 @@ def install_nodejs(self):
     def install_wget(self):
         pass

+    def install_numactl(self):
+        pass
+

 class Darwin(Common):
     def __init__(self):
@@ -146,6 +152,10 @@ def install_wget(self):
         if os.system("wget --version") != 0 or args.force:
             os.system("brew install wget")

+    def install_numactl(self):
+        if os.system("numactl --show") != 0 or args.force:
+            os.system("brew install numactl")
+

 def install_dependencies(cuda_version=None, nightly=False):
     os_map = {"Linux": Linux, "Windows": Windows, "Darwin": Darwin}
@@ -155,15 +165,14 @@
     system.install_wget()
     system.install_nodejs()
     system.install_node_packages()
-
-    if platform.system() == "Linux" and args.environment == "dev":
-        system.install_libgit2()
+    system.install_numactl()

     # Sequence of installation to be maintained
     system.install_java()
-    requirements_file_path = "requirements/" + (
-        "production.txt" if args.environment == "prod" else "developer.txt"
-    )
+
+    requirements_file = "common.txt" if args.environment == "prod" else "developer.txt"
+    requirements_file_path = os.path.join("requirements", requirements_file)
+
     system.install_python_packages(cuda_version, requirements_file_path, nightly)
@@ -181,7 +190,7 @@ def get_brew_version():
     parser.add_argument(
         "--cuda",
         default=None,
-        choices=["cu92", "cu101", "cu102", "cu111", "cu113", "cu116", "cu117"],
+        choices=["cu92", "cu101", "cu102", "cu111", "cu113", "cu116", "cu117", "cu118"],
         help="CUDA version for torch",
     )
     parser.add_argument(
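
install_numactl() follows the same probe-then-install pattern already used for wget: run the tool once, and only call the package manager when the probe fails or --force is set. The pattern in isolation (ensure_tool is a hypothetical generalization, not part of this change):

import os

# Hypothetical generalization of the wget/numactl install checks.
def ensure_tool(probe_cmd, install_cmd, force=False):
    # os.system returns non-zero when the probe command fails,
    # which is taken to mean the tool is missing.
    if os.system(probe_cmd) != 0 or force:
        os.system(install_cmd)

# ensure_tool("numactl --show", "sudo apt-get install -y numactl")
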
diff --git a/ts_scripts/install_from_src.py b/ts_scripts/install_from_src.py
index 9c555f9c3d..40f2a9ae82 100644
--- a/ts_scripts/install_from_src.py
+++ b/ts_scripts/install_from_src.py
@@ -27,12 +27,14 @@ def install_from_src(dev=False):
         "--environment",
         type=str,
         default="production",
-        help="options: dev|prod",
+        help="options: dev|production",
+    )
+    parser.add_argument(
+        "--git-branch",
+        type=str,
+        default="main",
     )
     args = parser.parse_args()

     check_python_version()
-    from pygit2 import Repository
-
-    git_branch = Repository(".").head.shorthand
-    build_hdr_printer.main(git_branch)
+    build_hdr_printer.main(args.git_branch)

     install_from_src(args.environment == "dev")
diff --git a/ts_scripts/marsgen.py b/ts_scripts/marsgen.py
index 3f61fa2fc4..dc4d56a374 100644
--- a/ts_scripts/marsgen.py
+++ b/ts_scripts/marsgen.py
@@ -1,10 +1,10 @@
 import argparse
 import json
 import os
-import sys
-import urllib.request
 import shutil
 import subprocess
+import sys
+import urllib.request

 REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")
 sys.path.append(REPO_ROOT)
@@ -12,6 +12,7 @@
 os.makedirs(MODEL_STORE_DIR, exist_ok=True)
 MAR_CONFIG_FILE_PATH = os.path.join(REPO_ROOT, "ts_scripts", "mar_config.json")

+
 def delete_model_store_gen_dir():
     print(f"## Deleting model_store_gen_dir: {MODEL_STORE_DIR}\n")
     mar_set.clear()
@@ -21,7 +22,10 @@ def delete_model_store_gen_dir():
     except OSError as e:
         print("Error: %s : %s" % (MODEL_STORE_DIR, e.strerror))

+
 mar_set = set()
+
+
 def gen_mar(model_store=None):
     print(f"## Starting gen_mar: {model_store}\n")
     if len(mar_set) == 0:
@@ -53,7 +57,9 @@ def generate_mars(mar_config=MAR_CONFIG_FILE_PATH, model_store_dir=MODEL_STORE_D
     - "extra_files": the paths of extra files
     Note: To generate .pt file, "serialized_file_remote" and "gen_scripted_file_path" must be provided
     """
-    print(f"## Starting generate_mars, mar_config:{mar_config}, model_store_dir:{model_store_dir}\n")
+    print(
+        f"## Starting generate_mars, mar_config:{mar_config}, model_store_dir:{model_store_dir}\n"
+    )
     mar_set.clear()
     cwd = os.getcwd()
     os.chdir(REPO_ROOT)
@@ -63,45 +69,51 @@ def generate_mars(mar_config=MAR_CONFIG_FILE_PATH, model_store_dir=MODEL_STORE_D
     for model in models:
         serialized_file_path = None
         if model.get("serialized_file_remote") and model["serialized_file_remote"]:
-            if model.get("gen_scripted_file_path") and model["gen_scripted_file_path"]:
+            if (
+                model.get("gen_scripted_file_path")
+                and model["gen_scripted_file_path"]
+            ):
                 subprocess.run(["python", model["gen_scripted_file_path"]])
             else:
-                serialized_model_file_url = \
-                    "https://download.pytorch.org/models/{}".format(model["serialized_file_remote"])
+                serialized_model_file_url = (
+                    "https://download.pytorch.org/models/{}".format(
+                        model["serialized_file_remote"]
+                    )
+                )
                 urllib.request.urlretrieve(
                     serialized_model_file_url,
-                    f'{model_store_dir}/{model["serialized_file_remote"]}')
-            serialized_file_path = os.path.join(model_store_dir, model["serialized_file_remote"])
+                    f'{model_store_dir}/{model["serialized_file_remote"]}',
+                )
+            serialized_file_path = os.path.join(
+                model_store_dir, model["serialized_file_remote"]
+            )
         elif model.get("serialized_file_local") and model["serialized_file_local"]:
             serialized_file_path = model["serialized_file_local"]

-        handler = None
-        if model.get("handler") and model["handler"]:
-            handler = model["handler"]
+        handler = model.get("handler", None)

-        extra_files = None
-        if model.get("extra_files") and model["extra_files"]:
-            extra_files = model["extra_files"]
+        extra_files = model.get("extra_files", None)

-        runtime = None
-        if model.get("runtime") and model["runtime"]:
-            runtime = model["runtime"]
+        runtime = model.get("runtime", None)

-        archive_format = None
-        if model.get("archive_format") and model["archive_format"]:
-            archive_format = model["archive_format"]
+        archive_format = model.get("archive_format", "zip-store")

-        requirements_file = None
-        if model.get("requirements_file") and model["requirements_file"]:
-            requirements_file = model["requirements_file"]
+        requirements_file = model.get("requirements_file", None)

-        export_path = model_store_dir
-        if model.get("export_path") and model["export_path"]:
-            export_path = model["export_path"]
+        export_path = model.get("export_path", model_store_dir)

-        cmd = model_archiver_command_builder(model["model_name"], model["version"], model["model_file"],
-                                             serialized_file_path, handler, extra_files,
-                                             runtime, archive_format, requirements_file, export_path)
+        cmd = model_archiver_command_builder(
+            model["model_name"],
+            model["version"],
+            model["model_file"],
+            serialized_file_path,
+            handler,
+            extra_files,
+            runtime,
+            archive_format,
+            requirements_file,
+            export_path,
+        )
         print(f"## In directory: {os.getcwd()} | Executing command: {cmd}\n")
         try:
             subprocess.check_call(cmd, shell=True)
@@ -109,19 +121,34 @@ def generate_mars(mar_config=MAR_CONFIG_FILE_PATH, model_store_dir=MODEL_STORE_D
             print("## {} is generated.\n".format(marfile))
             mar_set.add(marfile)
         except subprocess.CalledProcessError as exc:
-            print("## {} creation failed !, error: {}\n".format(model["model_name"], exc))
-
-            if model.get("serialized_file_remote") and \
-               model["serialized_file_remote"] and \
-               os.path.exists(serialized_file_path):
+            print(
+                "## {} creation failed !, error: {}\n".format(
+                    model["model_name"], exc
+                )
+            )
+
+            if (
+                model.get("serialized_file_remote")
+                and model["serialized_file_remote"]
+                and os.path.exists(serialized_file_path)
+            ):
                 os.remove(serialized_file_path)
     os.chdir(cwd)


-def model_archiver_command_builder(model_name=None, version=None, model_file=None,
-                                   serialized_file=None, handler=None, extra_files=None,
-                                   runtime=None, archive_format=None, requirements_file=None,
-                                   export_path=None, force=True):
+def model_archiver_command_builder(
+    model_name=None,
+    version=None,
+    model_file=None,
+    serialized_file=None,
+    handler=None,
+    extra_files=None,
+    runtime=None,
+    archive_format=None,
+    requirements_file=None,
+    export_path=None,
+    force=True,
+):
     cmd = "torch-model-archiver"

     if model_name:
@@ -159,14 +186,21 @@ def model_archiver_command_builder(model_name=None, version=None, model_file=Non
     return cmd

+
 if __name__ == "__main__":
     # cmd:
     # python ts_scripts/marsgen.py
     # python ts_scripts/marsgen.py --config my_mar_config.json
     parser = argparse.ArgumentParser(description="Generate model mar files")
-    parser.add_argument('--config', default=MAR_CONFIG_FILE_PATH, help="mar file configuration json file")
-    parser.add_argument('--model-store', default=MODEL_STORE_DIR, help="model store dir")
+    parser.add_argument(
+        "--config",
+        default=MAR_CONFIG_FILE_PATH,
+        help="mar file configuration json file",
+    )
+    parser.add_argument(
+        "--model-store", default=MODEL_STORE_DIR, help="model store dir"
+    )
     args = parser.parse_args()

     generate_mars(args.config, MODEL_STORE_DIR)
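
Collapsing each `x = None; if model.get(key) and model[key]: x = model[key]` block into `model.get(key, None)` is equivalent except for one edge case: a key present with a falsy value (say, an empty string) now propagates as-is instead of being normalized to None. Also note that archive_format's fallback changed from None to "zip-store". A quick illustration of the .get() subtlety:

model = {"handler": ""}

# Old pattern: falsy values were normalized to None.
handler = None
if model.get("handler") and model["handler"]:
    handler = model["handler"]
assert handler is None

# New pattern: the falsy value itself comes through.
assert model.get("handler", None) == ""
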
diff --git a/ts_scripts/sanity_utils.py b/ts_scripts/sanity_utils.py
index 4d0e93120f..db9f297ff6 100755
--- a/ts_scripts/sanity_utils.py
+++ b/ts_scripts/sanity_utils.py
@@ -1,18 +1,23 @@
 import glob
+import json
 import os
+import subprocess
 import sys
+from pathlib import Path

-import nvgpu
+import torch

 from ts_scripts import marsgen as mg
-
-REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")
-sys.path.append(REPO_ROOT)
-
 from ts_scripts import tsutils as ts
 from ts_scripts import utils
 from ts_scripts.tsutils import generate_grpc_client_stubs

+REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")
+sys.path.append(REPO_ROOT)
+MODELS_CONFIG_FILE_PATH = Path(__file__).parent.joinpath(
+    "configs", "sanity_models.json"
+)
+

 def run_markdown_link_checker():
     print("## Started markdown link checker")
@@ -32,6 +37,8 @@ def validate_model_on_gpu():
     # Assumption is -
     # 1. GPUs on test setup are only utlizied by torchserve
     # 2. Models are successfully UNregistered between subsequent calls
+    import nvgpu
+
     model_loaded = False
     for info in nvgpu.gpu_info():
         if info["mem_used"] > 0 and info["mem_used_percent"] > 0.0:
@@ -40,189 +47,151 @@ def validate_model_on_gpu():
     return model_loaded


-def test_sanity():
-    generate_grpc_client_stubs()
+def load_model_to_validate():
+    with open(MODELS_CONFIG_FILE_PATH) as f:
+        model_list = json.load(f)
+    assert isinstance(model_list, list)

-    print("## Started sanity tests")
+    print(model_list)
+    models_to_validate = {}
+    for m in model_list:
+        models_to_validate[m["name"]] = m

-    resnet18_model = {
-        "name": "resnet-18",
-        "inputs": ["examples/image_classifier/kitten.jpg"],
-        "handler": "image_classifier",
-    }
+    # models_to_validate = {m["name"]: m for m in model_list}
+    assert len(models_to_validate) == len(
+        model_list
+    ), "Model names are expected to be unique"
+    return models_to_validate

-    bert_token_classification_no_torchscript_model = {
-        "name": "bert_token_classification_no_torchscript",
-        "inputs": [
-            "examples/Huggingface_Transformers/Token_classification_artifacts/sample_text.txt"
-        ],
-        "handler": "custom",
-    }

-    bert_seqc_without_torchscript_model = {
-        "name": "bert_seqc_without_torchscript",
-        "inputs": [
-            "examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
-        ],
-        "handler": "custom",
-    }
+def test_gpu_setup():
+    is_gpu_instance = utils.is_gpu_instance()
+    if is_gpu_instance:
+        assert torch.cuda.is_available(), "## Ohh its NOT running on GPU !"

-    models_to_validate = [
-        {
-            "name": "fastrcnn",
-            "inputs": ["examples/object_detector/persons.jpg"],
-            "handler": "object_detector",
-        },
-        {
-            "name": "fcn_resnet_101",
-            "inputs": [
-                "docs/images/blank_image.jpg",
-                "examples/image_segmenter/persons.jpg",
-            ],
-            "handler": "image_segmenter",
-        },
-        {
-            "name": "my_text_classifier_v4",
-            "inputs": ["examples/text_classification/sample_text.txt"],
-            "handler": "text_classification",
-        },
-        resnet18_model,
-        {
-            "name": "my_text_classifier_scripted_v3",
-            "inputs": ["examples/text_classification/sample_text.txt"],
-            "handler": "text_classification",
-        },
-        {
-            "name": "alexnet_scripted",
-            "inputs": ["examples/image_classifier/kitten.jpg"],
-            "handler": "image_classifier",
-        },
-        {
-            "name": "fcn_resnet_101_scripted",
-            "inputs": ["examples/image_segmenter/persons.jpg"],
-            "handler": "image_segmenter",
-        },
-        {
-            "name": "distill_bert_qa_eager",
-            "inputs": [
-                "examples/Huggingface_Transformers/QA_artifacts/sample_text.txt"
-            ],
-            "handler": "custom",
-        },
-        {
-            "name": "bert_token_classification_no_torchscript",
-            "inputs": [
-                "examples/Huggingface_Transformers/Token_classification_artifacts/sample_text.txt"
-            ],
-            "handler": "custom",
-        },
-        {
-            "name": "bert_seqc_without_torchscript",
-            "inputs": [
-                "examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
-            ],
-            "handler": "custom",
-        },
-    ]
-
-    if not sys.platform.startswith("win"):
-        models_to_validate.extend(
-            (
-                bert_token_classification_no_torchscript_model,
-                bert_seqc_without_torchscript_model,
-            )
-        )

-    ts_log_file = os.path.join("logs", "ts_console.log")
-    is_gpu_instance = utils.is_gpu_instance()
+def run_grpc_test(model: dict):
+    model_name = model["name"]
+    model_inputs = model["inputs"]

-    os.makedirs("model_store", exist_ok=True)
-    os.makedirs("logs", exist_ok=True)
+    # Run gRPC sanity
+    print("pass mg.mar_set=", mg.mar_set)
+    mar_set_list_str = [str(s) for s in mg.mar_set]
+    mar_set_str = ",".join(mar_set_list_str)
+    register_model_grpc_cmd = f"python ts_scripts/torchserve_grpc_client.py register {model_name} {mar_set_str}"
+    status = os.system(register_model_grpc_cmd)

-    if is_gpu_instance:
-        import torch
+    if status != 0:
+        print("## Failed to register model with torchserve")
+        sys.exit(1)
+    else:
+        print(f"## Successfully registered {model_name} model with torchserve")
+
+    for input in model_inputs:
+        infer_model_grpc_cmd = [
+            "python",
+            "ts_scripts/torchserve_grpc_client.py",
+            "infer",
+            f"{model_name}",
+            f"{input}",
+        ]
+        p = subprocess.run(infer_model_grpc_cmd, capture_output=True, text=True)
+        out = p.stdout.split("\n")
+        print("\n".join(out[:50]))
+        if len(out) > 50:
+            print("")
+
+        if p.returncode != 0:
+            print(f"## Failed to run inference on {model_name} model")
+            sys.exit(1)
+        else:
+            print(f"## Successfully ran inference on {model_name} model.")

-        if not torch.cuda.is_available():
-            sys.exit("## Ohh its NOT running on GPU !")
+    unregister_model_grpc_cmd = (
+        f"python ts_scripts/torchserve_grpc_client.py unregister {model_name}"
+    )
+    status = os.system(unregister_model_grpc_cmd)

-    started = ts.start_torchserve(log_file=ts_log_file)
-    if not started:
+    if status != 0:
+        print(f"## Failed to unregister {model_name}")
         sys.exit(1)
+    else:
+        print(f"## Successfully unregistered {model_name}")

-    for model in models_to_validate:
-        model_name = model["name"]
-        model_inputs = model["inputs"]
-        model_handler = model["handler"]

-        # Run gRPC sanity
-        print("pass mg.mar_set=", mg.mar_set)
-        mar_set_list_str = [str(s) for s in mg.mar_set]
-        mar_set_str = ",".join(mar_set_list_str)
-        register_model_grpc_cmd = f"python ts_scripts/torchserve_grpc_client.py register {model_name} {mar_set_str}"
-        status = os.system(register_model_grpc_cmd)
+def run_rest_test(model, register_model=True, unregister_model=True):
+    model_name = model["name"]
+    model_inputs = model["inputs"]
+    model_handler = model["handler"]

-        if status != 0:
-            print("## Failed to register model with torchserve")
-            sys.exit(1)
-        else:
+    if register_model:
+        response = ts.register_model(model_name)
+        if response and response.status_code == 200:
             print(f"## Successfully registered {model_name} model with torchserve")
+        else:
+            print(f"## Failed to register {model_name} model with torchserve")
+            sys.exit(1)

-        for input in model_inputs:
-            infer_model_grpc_cmd = f"python ts_scripts/torchserve_grpc_client.py infer {model_name} {input}"
-            status = os.system(infer_model_grpc_cmd)
-            if status != 0:
+    # For each input execute inference n=4 times
+    for input in model_inputs:
+        for i in range(4):
+            response = ts.run_inference(model_name, input)
+            if response and response.status_code == 200:
+                print(f"## Successfully ran inference on {model_name} model.")
+            else:
                 print(f"## Failed to run inference on {model_name} model")
                 sys.exit(1)
-            else:
-                print(f"## Successfully ran inference on {model_name} model.")
-
-        unregister_model_grpc_cmd = (
-            f"python ts_scripts/torchserve_grpc_client.py unregister {model_name}"
-        )
-        status = os.system(unregister_model_grpc_cmd)

-        if status != 0:
-            print(f"## Failed to unregister {model_name}")
-            sys.exit(1)
+    if torch.cuda.is_available():
+        if validate_model_on_gpu():
+            print(f"## Model {model_name} successfully loaded on GPU")
         else:
-            print(f"## Successfully unregistered {model_name}")
+            sys.exit(
+                f"## Something went wrong, model {model_name} did not load on GPU!!"
+            )

-        # Run REST sanity
-        response = ts.register_model(model_name)
+    # skip unregistering resnet-18 model to test snapshot feature with restart
+    if unregister_model:
+        response = ts.unregister_model(model_name)
         if response and response.status_code == 200:
-            print(f"## Successfully registered {model_name} model with torchserve")
+            print(f"## Successfully unregistered {model_name}")
         else:
-            print("## Failed to register model with torchserve")
+            print(f"## Failed to unregister {model_name}")
             sys.exit(1)

-        # For each input execute inference n=4 times
-        for input in model_inputs:
-            for i in range(4):
-                response = ts.run_inference(model_name, input)
-                if response and response.status_code == 200:
-                    print(f"## Successfully ran inference on {model_name} model.")
-                else:
-                    print(f"## Failed to run inference on {model_name} model")
-                    sys.exit(1)
-
-        if is_gpu_instance:
-            if validate_model_on_gpu():
-                print(f"## Model {model_name} successfully loaded on GPU")
-            else:
-                sys.exit(
-                    f"## Something went wrong, model {model_name} did not load on GPU!!"
-                )
+    print(f"## {model_handler} handler is stable.")

-        # skip unregistering resnet-18 model to test snapshot feature with restart
-        if model != resnet18_model:
-            response = ts.unregister_model(model_name)
-            if response and response.status_code == 200:
-                print(f"## Successfully unregistered {model_name}")
-            else:
-                print(f"## Failed to unregister {model_name}")
-                sys.exit(1)
-        print(f"## {model_handler} handler is stable.")

+def test_sanity():
+    generate_grpc_client_stubs()
+
+    print("## Started sanity tests")
+
+    models_to_validate = load_model_to_validate()
+
+    test_gpu_setup()
+
+    ts_log_file = os.path.join("logs", "ts_console.log")
+
+    os.makedirs("model_store", exist_ok=True)
+    os.makedirs("logs", exist_ok=True)
+
+    mg.mar_set = set(os.listdir("model_store"))
+    started = ts.start_torchserve(log_file=ts_log_file, gen_mar=False)
+    if not started:
+        sys.exit(1)
+
+    resnet18_model = models_to_validate["resnet-18"]
+
+    models_to_validate = {
+        k: v for k, v in models_to_validate.items() if k != "resnet-18"
+    }
+
+    for _, model in models_to_validate.items():
+        run_grpc_test(model)
+        run_rest_test(model)
+
+    run_rest_test(resnet18_model, unregister_model=False)

     stopped = ts.stop_torchserve()
     if not stopped:
@@ -230,32 +199,16 @@ def test_sanity():

     # Restarting torchserve
     # This should restart with the generated snapshot and resnet-18 model should be automatically registered
-    started = ts.start_torchserve(log_file=ts_log_file)
+    started = ts.start_torchserve(log_file=ts_log_file, gen_mar=False)
     if not started:
         sys.exit(1)

-    response = ts.run_inference(resnet18_model["name"], resnet18_model["inputs"][0])
-    if response and response.status_code == 200:
-        print(f"## Successfully ran inference on {resnet18_model['name']} model.")
-    else:
-        print(f"## Failed to run inference on {resnet18_model['name']} model")
-        sys.exit(1)
-
-    response = ts.unregister_model(resnet18_model["name"])
-    if response and response.status_code == 200:
-        print(f"## Successfully unregistered {resnet18_model['name']}")
-    else:
-        print(f"## Failed to unregister {resnet18_model['name']}")
-        sys.exit(1)
+    run_rest_test(resnet18_model, register_model=False)

     stopped = ts.stop_torchserve()
     if not stopped:
         sys.exit(1)

-    links_ok = run_markdown_link_checker()
-    if not links_ok:
-        print("##WARNING : Broken links in docs.")


 def test_workflow_sanity():
     current_path = os.getcwd()
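
With the register_model/unregister_model flags on run_rest_test, the snapshot check in test_sanity condenses to four calls: leave resnet-18 registered, bounce the server, and expect the snapshot to restore the model. Condensed control flow of the lines above (not runnable on its own; it assumes the surrounding test_sanity scope):

# Snapshot round trip from test_sanity, condensed.
run_rest_test(resnet18_model, unregister_model=False)  # keep it registered
ts.stop_torchserve()
ts.start_torchserve(log_file=ts_log_file, gen_mar=False)  # snapshot restores it
run_rest_test(resnet18_model, register_model=False)  # so skip registration
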
@@ -300,3 +253,10 @@ def test_workflow_sanity():
     stopped = ts.stop_torchserve()
     if not stopped:
         sys.exit(1)
+
+
+def test_markdown_files():
+    links_ok = run_markdown_link_checker()
+    if not links_ok:
+        print("##WARNING : Broken links in docs.")
+    return links_ok
diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt
index 75f2b3a669..04ed1aa80d 100644
--- a/ts_scripts/spellcheck_conf/wordlist.txt
+++ b/ts_scripts/spellcheck_conf/wordlist.txt
@@ -998,6 +998,7 @@ NVfuser
 fuser
 ort
 sess
+dali
 BetterTransformer
 TransformerEncoder
 InferenceTimeInMS
@@ -1020,3 +1021,38 @@ datapipes
 tensorrt
 vec
 torchdata
+CodeQL
+Dependabot
+Snyk
+pythonversion
+StreamPredictions
+LLMs
+MPS
+mps
+deviceIds
+rpc
+pippy
+MBS
+MicroBatching
+MicroBatchingHandler
+QPS
+PiPPy
+Microbatching
+Micro-batching
+microbatch
+microbatching
+DeviceId
+PredictionTime
+QueueTime
+WorkerLoadTime
+WorkerName
+WorkerThreadTime
+MicroSoft
+lmi
+torchrun
+nproc
+largemodels
+torchpippy
+InferenceSession
+maxRetryTimeoutInSec
+neuronx
\ No newline at end of file
diff --git a/ts_scripts/torchserve_grpc_client.py b/ts_scripts/torchserve_grpc_client.py
index 367f3a6036..ccf293ed3f 100644
--- a/ts_scripts/torchserve_grpc_client.py
+++ b/ts_scripts/torchserve_grpc_client.py
@@ -35,6 +35,23 @@ def infer(stub, model_name, model_input):
         exit(1)


+def infer_stream(stub, model_name, model_input):
+    with open(model_input, "rb") as f:
+        data = f.read()
+
+    input_data = {"data": data}
+    responses = stub.StreamPredictions(
+        inference_pb2.PredictionsRequest(model_name=model_name, input=input_data)
+    )
+
+    try:
+        for resp in responses:
+            prediction = resp.prediction.decode("utf-8")
+            print(prediction)
+    except grpc.RpcError as e:
+        exit(1)
+
+
 def register(stub, model_name, mar_set_str):
     mar_set = set()
     if mar_set_str:
@@ -93,6 +110,9 @@ def unregister(stub, model_name):
     infer_action_parser = subparsers.add_parser(
         "infer", parents=[parent_parser], add_help=False
     )
+    infer_stream_action_parser = subparsers.add_parser(
+        "infer_stream", parents=[parent_parser], add_help=False
+    )
     register_action_parser = subparsers.add_parser(
         "register", parents=[parent_parser], add_help=False
     )
@@ -104,6 +124,13 @@ def unregister(stub, model_name):
         "model_input", type=str, default=None, help="Input for model for inferencing."
     )

+    infer_stream_action_parser.add_argument(
+        "model_input",
+        type=str,
+        default=None,
+        help="Input for model for stream inferencing.",
+    )
+
     register_action_parser.add_argument(
         "mar_set",
         type=str,
@@ -116,6 +143,8 @@ def unregister(stub, model_name):

     if args.action == "infer":
         infer(get_inference_stub(), args.model_name, args.model_input)
+    elif args.action == "infer_stream":
+        infer_stream(get_inference_stub(), args.model_name, args.model_input)
     elif args.action == "register":
         register(get_management_stub(), args.model_name, args.mar_set)
     elif args.action == "unregister":
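
The new infer_stream action drives the server-streaming StreamPredictions RPC and prints each response chunk as it arrives. An example invocation, assuming a running TorchServe instance and a model whose handler actually streams (both names below are placeholders):

import subprocess

subprocess.run(
    [
        "python",
        "ts_scripts/torchserve_grpc_client.py",
        "infer_stream",
        "my_streaming_model",  # placeholder model name
        "examples/sample_input.txt",  # placeholder input file
    ],
    check=True,
)
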
diff --git a/ts_scripts/tsutils.py b/ts_scripts/tsutils.py
index 77687611e9..6fb1f6b42f 100644
--- a/ts_scripts/tsutils.py
+++ b/ts_scripts/tsutils.py
@@ -1,66 +1,105 @@
 import os
 import platform
 import sys
-import time
+import threading
+from pathlib import Path
+from subprocess import PIPE, STDOUT, Popen
+
 import requests
+
 from ts_scripts import marsgen as mg

 torchserve_command = {
     "Windows": "torchserve.exe",
     "Darwin": "torchserve",
-    "Linux": "torchserve"
+    "Linux": "torchserve",
 }

 torch_model_archiver_command = {
-        "Windows": "torch-model-archiver.exe",
-        "Darwin": "torch-model-archiver",
-        "Linux": "torch-model-archiver"
-    }
+    "Windows": "torch-model-archiver.exe",
+    "Darwin": "torch-model-archiver",
+    "Linux": "torch-model-archiver",
+}

 torch_workflow_archiver_command = {
-        "Windows": "torch-workflow-archiver.exe",
-        "Darwin": "torch-workflow-archiver",
-        "Linux": "torch-workflow-archiver"
-    }
+    "Windows": "torch-workflow-archiver.exe",
+    "Darwin": "torch-workflow-archiver",
+    "Linux": "torch-workflow-archiver",
+}
+
+
+class LogPipeTillTheEnd(threading.Thread):
+    def __init__(self, pipe, log_file):
+        super().__init__()
+        self.pipe = pipe
+        self.log_file = log_file
+
+    def run(self):
+        with open(self.log_file, "a") as f:
+            for line in self.pipe.stdout:
+                f.write(line.decode("utf-8"))


 def start_torchserve(
-        ncs=False, model_store="model_store", workflow_store="",
-        models="", config_file="", log_file="", wait_for=10, gen_mar=True):
+    ncs=False,
+    model_store="model_store",
+    workflow_store="",
+    models="",
+    config_file="",
+    log_file="",
+    gen_mar=True,
+):
     if gen_mar:
         mg.gen_mar(model_store)
     print("## Starting TorchServe")
-    cmd = f"{torchserve_command[platform.system()]} --start --model-store={model_store}"
+    cmd = [f"{torchserve_command[platform.system()]}"]
+    cmd.append("--start")
+    cmd.append(f"--model-store={model_store}")
     if models:
-        cmd += f" --models={models}"
+        cmd.append(f"--models={models}")
     if workflow_store:
-        cmd += f" --workflow-store={workflow_store}"
+        cmd.append(f"--workflow-store={workflow_store}")
     if ncs:
-        cmd += " --ncs"
+        cmd.append("--ncs")
     if config_file:
-        cmd += f" --ts-config={config_file}"
+        cmd.append(f"--ts-config={config_file}")
     if log_file:
         print(f"## Console logs redirected to file: {log_file}")
-        cmd += f" >> {log_file}"
-    print(f"## In directory: {os.getcwd()} | Executing command: {cmd}")
-    status = os.system(cmd)
+    print(f"## In directory: {os.getcwd()} | Executing command: {' '.join(cmd)}")
+    p = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
+    if log_file:
+        Path(log_file).parent.absolute().mkdir(parents=True, exist_ok=True)
+        with open(log_file, "a") as f:
+            for line in p.stdout:
+                f.write(line.decode("utf-8"))
+                if "Model server started" in str(line).strip():
+                    break
+        t = LogPipeTillTheEnd(p, log_file)
+        t.start()
+    else:
+        for line in p.stdout:
+            if "Model server started" in str(line).strip():
+                break
+
+    status = p.poll()
     if status == 0:
         print("## Successfully started TorchServe")
-        time.sleep(wait_for)
         return True
     else:
         print("## TorchServe failed to start !")
         return False


-def stop_torchserve(wait_for=10):
+def stop_torchserve():
     print("## Stopping TorchServe")
-    cmd = f"{torchserve_command[platform.system()]} --stop"
+    cmd = [f"{torchserve_command[platform.system()]}"]
+    cmd.append("--stop")
     print(f"## In directory: {os.getcwd()} | Executing command: {cmd}")
-    status = os.system(cmd)
+    p = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
+
+    status = p.wait()
     if status == 0:
         print("## Successfully stopped TorchServe")
-        time.sleep(wait_for)
         return True
     else:
         print("## TorchServe failed to stop !")
@@ -86,7 +125,9 @@ def register_model(model_name, protocol="http", host="localhost", port="8081"):
     return response


-def run_inference(model_name, file_name, protocol="http", host="localhost", port="8080", timeout=120):
+def run_inference(
+    model_name, file_name, protocol="http", host="localhost", port="8080", timeout=120
+):
     print(f"## Running inference on {model_name} model")
     url = f"{protocol}://{host}:{port}/predictions/{model_name}"
     files = {"data": (file_name, open(file_name, "rb"))}
@@ -103,9 +144,11 @@ def unregister_model(model_name, protocol="http", host="localhost", port="8081")

 def generate_grpc_client_stubs():
     print("## Started generating gRPC clinet stubs")
-    cmd = "python -m grpc_tools.protoc --proto_path=frontend/server/src/main/resources/proto/ --python_out=ts_scripts " \
-          "--grpc_python_out=ts_scripts frontend/server/src/main/resources/proto/inference.proto " \
-          "frontend/server/src/main/resources/proto/management.proto"
+    cmd = (
+        "python -m grpc_tools.protoc --proto_path=frontend/server/src/main/resources/proto/ --python_out=ts_scripts "
+        "--grpc_python_out=ts_scripts frontend/server/src/main/resources/proto/inference.proto "
+        "frontend/server/src/main/resources/proto/management.proto"
+    )
     status = os.system(cmd)
     if status != 0:
         print("Could not generate gRPC client stubs")
@@ -115,9 +158,7 @@ def generate_grpc_client_stubs():
 def register_workflow(workflow_name, protocol="http", host="localhost", port="8081"):
     print(f"## Registering {workflow_name} workflow")
     model_zoo_url = "https://torchserve.s3.amazonaws.com"
-    params = (
-        ("url", f"{model_zoo_url}/war_files/{workflow_name}.war"),
-    )
+    params = (("url", f"{model_zoo_url}/war_files/{workflow_name}.war"),)
     url = f"{protocol}://{host}:{port}/workflows"
     response = requests.post(url, params=params, verify=False)
     return response
@@ -130,7 +171,14 @@ def unregister_workflow(workflow_name, protocol="http", host="localhost", port="
     return response


-def workflow_prediction(workflow_name, file_name, protocol="http", host="localhost", port="8080", timeout=120):
+def workflow_prediction(
+    workflow_name,
+    file_name,
+    protocol="http",
+    host="localhost",
+    port="8080",
+    timeout=120,
+):
     print(f"## Running inference on {workflow_name} workflow")
     url = f"{protocol}://{host}:{port}/wfpredict/{workflow_name}"
     files = {"data": (file_name, open(file_name, "rb"))}
diff --git a/workflow-archiver/workflow_archiver/version.txt b/workflow-archiver/workflow_archiver/version.txt
index b0032849c8..a45be46276 100644
--- a/workflow-archiver/workflow_archiver/version.txt
+++ b/workflow-archiver/workflow_archiver/version.txt
@@ -1 +1 @@
-0.2.7
+0.2.8
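
start_torchserve() now blocks on the child's own stdout until the "Model server started" marker appears, instead of sleeping a fixed wait_for; after that, LogPipeTillTheEnd keeps draining the pipe into the log so the server cannot stall on a full pipe buffer. The pattern in isolation, with the command and marker as stand-ins rather than the TorchServe specifics:

import threading
from subprocess import PIPE, STDOUT, Popen

# Generic sketch of the readiness-detection pattern used above.
def start_and_wait(cmd, marker, log_path):
    p = Popen(cmd, stdout=PIPE, stderr=STDOUT)
    with open(log_path, "a") as f:
        for raw in p.stdout:
            text = raw.decode("utf-8")
            f.write(text)
            if marker in text:
                break  # server is up; stop blocking the caller
    # Keep draining stdout in the background so the pipe never fills.
    threading.Thread(target=p.stdout.read, daemon=True).start()
    return p

# p = start_and_wait(["torchserve", "--start"], "Model server started", "logs/ts.log")
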