Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,5 +131,6 @@ dmypy.json
exp__*
experiments/simple_raytune/benchmark__RaytuneBenchmark
experiments/optuna_minikube/benchmark__OptunaMinikubeBenchmark
experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark

data/
29 changes: 29 additions & 0 deletions experiments/polyaxon_minikube/experiment_template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Polyaxon operation template for a grid-search study.
# The $-placeholders ($study_name, $jobs_num, $worker_num, $worker_image,
# $metrics_ip) are substituted by the benchmark harness before submission
# (see grid.yaml for a rendered example).
version: 1.1
kind: operation
name: $study_name
matrix:
  kind: grid
  numRuns: $jobs_num        # total number of trials in the study
  concurrency: $worker_num  # trials executed in parallel
  params:
    lr:
      kind: logspace
      # start:stop:count — learning-rate grid sampled on a log scale
      value: 0.01:0.1:$jobs_num
component:
  inputs:
  - name: lr
    type: float
  run:
    kind: job
    container:
      imagePullPolicy: IfNotPresent
      image: $worker_image
      command: [python3, experiments/polyaxon_minikube/$worker_image/task.py]
      args: ["--batch-size=64", "--lr={{ lr }}"]
      env:
      - name: STUDY_NAME
        value: "$study_name"
      # NOTE(review): database credentials are hard-coded — consider
      # injecting them via a Kubernetes secret instead.
      - name: DB_CONN
        value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
      - name: "METRICS_STORAGE_HOST"
        value: "$metrics_ip"
29 changes: 29 additions & 0 deletions experiments/polyaxon_minikube/grid.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Concrete Polyaxon grid-search operation — appears to be a rendered
# instance of experiment_template.yaml with study-specific values filled in.
version: 1.1
kind: operation
name: polyaxon-study-57
matrix:
  kind: grid
  numRuns: 15     # total number of trials
  concurrency: 5  # trials executed in parallel
  params:
    lr:
      kind: logspace
      # start:stop:count — learning-rate grid sampled on a log scale
      value: 0.01:0.1:15
component:
  inputs:
  - name: lr
    type: float
  run:
    kind: job
    container:
      imagePullPolicy: IfNotPresent
      image: task_light
      command: [python3, experiments/polyaxon_minikube/task_light/task.py]
      args: ["--batch-size=64", "--lr={{ lr }}"]
      env:
      - name: STUDY_NAME
        value: "polyaxon-study-57"
      # NOTE(review): database credentials and the metrics host IP are
      # hard-coded — consider injecting them via secrets/config instead.
      - name: DB_CONN
        value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
      - name: "METRICS_STORAGE_HOST"
        value: "134.101.4.161"
17 changes: 17 additions & 0 deletions experiments/polyaxon_minikube/mnist_task/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Worker image for the Polyaxon MNIST benchmark task.
# TODO: change to a slimmer base image (the CUDA runtime image is large)
FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime
#FROM python:3.9-slim

RUN pip install pip --upgrade
RUN pip install polyaxon

# Copy the benchmark sources needed by the task into the image.
COPY experiments experiments
#COPY data data
COPY setup.py setup.py
COPY ml_benchmark ml_benchmark



# Install the benchmark package (and its dependencies) in editable mode.
RUN pip install -e .

CMD ["python", "experiments/polyaxon_minikube/mnist_task/task.py"]
11 changes: 11 additions & 0 deletions experiments/polyaxon_minikube/mnist_task/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# PyTorch MNIST Image Classification Example

This is a PyTorch MNIST image classification training container that saves
metrics to a file or prints them to stdout. It uses a convolutional neural
network to train the model.

Katib uses this training container in some Experiments, for instance in the
[file Metrics Collector example](../../metrics-collector/file-metrics-collector.yaml#L55-L64),
the [file Metrics Collector with logs in JSON format example](../../metrics-collector/file-metrics-collector-with-json-format.yaml#L52-L62),
the [median stopping early stopping rule with logs in JSON format example](../../early-stopping/median-stop-with-json-format.yaml#L62-L71)
and the [PyTorchJob example](../../kubeflow-training-operator/pytorchjob-mnist.yaml#L47-L54).
3 changes: 3 additions & 0 deletions experiments/polyaxon_minikube/mnist_task/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Pillow>=9.1.1
numpy
polyaxon
85 changes: 85 additions & 0 deletions experiments/polyaxon_minikube/mnist_task/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from polyaxon import tracking
import argparse
import logging
import numpy as np
import time
import os
import sys
# Make the repository root importable so `ml_benchmark` resolves when this
# script is executed directly inside the container.
# NOTE(review): five "../" from __file__ resolves one level ABOVE the
# directory four levels up (task.py -> mnist_task -> polyaxon_minikube ->
# experiments -> root) — confirm this lands on the intended project root
# in the container layout.
PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../../../"))
sys.path.append(PROJECT_ROOT)
from ml_benchmark.workload.mnist.mnist_task import MnistTask


def train(times, epoch):
    """Simulate one training epoch by logging a linearly decreasing loss.

    Args:
        times: number of simulated batches in the epoch.
        epoch: epoch index, used only in the log messages.
    """
    for x in np.arange(1, times + 1):
        # Loss decays linearly from 1.0 towards 1/times over the epoch.
        loss = 1 - (x - 1) / times
        time.sleep(0.1)  # simulate per-batch work
        # Lazy %-style args: the message is only formatted if emitted.
        logging.info(
            "Train Epoch: %s [%s/%s (%.0f%%)]\tloss=%.4f",
            epoch, x, times, 100. * x / times, loss,
        )


def test():
    """Simulate a validation pass by logging a fixed accuracy value."""
    accuracy = 0.8
    message = "Validation-accuracy={:.4f}\n".format(accuracy)
    logging.info(message)



def main():
    """Run one MNIST benchmark trial and report its metrics to Polyaxon.

    Parses the hyperparameters from the command line, executes the
    MnistTask objective, prints the weighted-average validation scores
    and pushes them to Polyaxon tracking.
    """
    # Parse the hyperparameters passed by the Polyaxon operation spec.
    parser = argparse.ArgumentParser(description="MNIST Example")
    parser.add_argument("--batch-size", type=int, default=64, metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--epochs", type=int, default=10, metavar="N",
                        help="number of epochs to train (default: 10)")
    parser.add_argument("--lr", type=float, default=0.01, metavar="LR",
                        help="learning rate (default: 0.01)")
    args = parser.parse_args()
    # NOTE(review): --batch-size is accepted for CLI compatibility with the
    # experiment template but is currently not forwarded to MnistTask.

    # Run one trial of the benchmark workload.
    task = MnistTask(config_init={"epochs": args.epochs})
    objective = task.create_objective()
    # TODO: add the weight decay to the definition of the template
    objective.set_hyperparameters({"learning_rate": args.lr,
                                   "weight_decay": 0.01})
    objective.train()
    validation_scores = objective.validate()

    # Report the weighted-average scores of the validation run.
    avg = validation_scores["weighted avg"]
    print("precision", avg["precision"])
    print("f1-score", avg["f1-score"])

    # Polyaxon: initialise run tracking, then push metrics and outputs.
    tracking.init()
    tracking.log_metrics(recall=avg["recall"])
    tracking.log_outputs(f1score=avg["f1-score"], precision=avg["precision"])


if __name__ == "__main__":
    main()
Loading