From 4cf5c927ade51b72178379af5ba1664ff7ed89ed Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Tue, 2 Aug 2022 14:23:13 +0200
Subject: [PATCH 1/9] Redone standalone polyaxon script

---
 .../experiment_template.yaml                  |  21 ++
 experiments/polyaxon_minikube/grid.yaml       |  21 ++
 .../polyaxon_minikube/mnist_task/Dockerfile   |  15 +
 .../polyaxon_minikube/mnist_task/README.md    |  11 +
 .../mnist_task/requirements.txt               |   3 +
 .../polyaxon_minikube/mnist_task/task.py      |  66 ++++
 .../polyaxon_minikube/polyaxon_benchmark.py   | 334 ++++++++++++++++++
 7 files changed, 471 insertions(+)
 create mode 100644 experiments/polyaxon_minikube/experiment_template.yaml
 create mode 100644 experiments/polyaxon_minikube/grid.yaml
 create mode 100644 experiments/polyaxon_minikube/mnist_task/Dockerfile
 create mode 100644 experiments/polyaxon_minikube/mnist_task/README.md
 create mode 100644 experiments/polyaxon_minikube/mnist_task/requirements.txt
 create mode 100644 experiments/polyaxon_minikube/mnist_task/task.py
 create mode 100644 experiments/polyaxon_minikube/polyaxon_benchmark.py

diff --git a/experiments/polyaxon_minikube/experiment_template.yaml b/experiments/polyaxon_minikube/experiment_template.yaml
new file mode 100644
index 0000000..507a2a5
--- /dev/null
+++ b/experiments/polyaxon_minikube/experiment_template.yaml
@@ -0,0 +1,21 @@
+version: 1.1
+kind: operation
+name: $study_name
+matrix:
+  kind: grid
+  numRuns: $jobs_num
+  concurrency: $worker_num
+  params:
+    lr:
+      kind: logspace
+      value: 0.01:0.1:$jobs_num
+component:
+  inputs:
+    - name: lr
+      type: float
+  run:
+    kind: job
+    container:
+      image: $worker_image
+      command: [python3, /opt/mnist_task/task.py]
+      args: ["--batch-size=64", "--lr={{ lr }}"]
diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml
new file mode 100644
index 0000000..a0e9625
--- /dev/null
+++ b/experiments/polyaxon_minikube/grid.yaml
@@ -0,0 +1,21 @@
+version: 1.1
+kind: operation
+name: polyaxon-study-72
+matrix:
+  kind: grid
+  numRuns: 5
+  concurrency: 5
+  params:
+    lr:
+      kind: logspace
+      value: 0.01:0.1:5
+component:
+  inputs:
+    - name: lr
+      type: float
+  run:
+    kind: job
+    container:
+      image: witja46/mnist_polyaxon:latest
+      command: [python3, /opt/mnist_task/task.py]
+      args: ["--batch-size=64", "--lr={{ lr }}"]
diff --git a/experiments/polyaxon_minikube/mnist_task/Dockerfile b/experiments/polyaxon_minikube/mnist_task/Dockerfile
new file mode 100644
index 0000000..1cbfd5b
--- /dev/null
+++ b/experiments/polyaxon_minikube/mnist_task/Dockerfile
@@ -0,0 +1,15 @@
+FROM python:3.9-slim
+
+ADD . /opt/mnist_task
+WORKDIR /opt/mnist_task
+
+# Add folder for the logs.
+RUN mkdir /katib
+RUN pip install --no-cache-dir -r requirements.txt
+
+RUN chgrp -R 0 /opt/mnist_task \
+  && chmod -R g+rwX /opt/mnist_task \
+  && chgrp -R 0 /katib \
+  && chmod -R g+rwX /katib
+
+ENTRYPOINT ["python3", "/opt/mnist_task/task.py"]
diff --git a/experiments/polyaxon_minikube/mnist_task/README.md b/experiments/polyaxon_minikube/mnist_task/README.md
new file mode 100644
index 0000000..3326bc7
--- /dev/null
+++ b/experiments/polyaxon_minikube/mnist_task/README.md
@@ -0,0 +1,11 @@
+# PyTorch MNIST Image Classification Example
+
+This is a PyTorch MNIST image classification training container that saves metrics
+to a file or prints them to stdout. It uses a convolutional neural network to
+train the model.
+
+Katib uses this training container in some Experiments, for instance in the
+[file Metrics Collector example](../../metrics-collector/file-metrics-collector.yaml#L55-L64),
+the [file Metrics Collector with logs in JSON format example](../../metrics-collector/file-metrics-collector-with-json-format.yaml#L52-L62),
+the [median stopping early stopping rule with logs in JSON format example](../../early-stopping/median-stop-with-json-format.yaml#L62-L71)
+and the [PyTorchJob example](../../kubeflow-training-operator/pytorchjob-mnist.yaml#L47-L54).
diff --git a/experiments/polyaxon_minikube/mnist_task/requirements.txt b/experiments/polyaxon_minikube/mnist_task/requirements.txt
new file mode 100644
index 0000000..95b0e5e
--- /dev/null
+++ b/experiments/polyaxon_minikube/mnist_task/requirements.txt
@@ -0,0 +1,3 @@
+Pillow>=9.1.1
+numpy
+polyaxon
\ No newline at end of file
diff --git a/experiments/polyaxon_minikube/mnist_task/task.py b/experiments/polyaxon_minikube/mnist_task/task.py
new file mode 100644
index 0000000..d18bae3
--- /dev/null
+++ b/experiments/polyaxon_minikube/mnist_task/task.py
@@ -0,0 +1,66 @@
+from __future__ import print_function
+
+from polyaxon import tracking
+import argparse
+import logging
+import os
+import numpy as np
+import time
+
+def train(times ,epoch):
+    loss = 1
+    for x in np.arange(1,times + 1): 
+        loss = 1 - (x -1 ) / times
+        time.sleep(0.01)
+        msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
+                epoch, x , times,
+                100. * x / times, loss)
+        logging.info(msg)
+
+
+def test():
+
+    test_accuracy =0.8 
+    logging.info("Validation-accuracy={:.4f}\n".format(
+        test_accuracy))
+
+
+
+def main():
+    
+    #Polyaxon
+    tracking.init()
+    
+    #parsing arguments 
+    parser = argparse.ArgumentParser(description="MNIST Example")
+    parser.add_argument("--batch-size", type=int, default=64, metavar="N",
+                        help="input batch size for training (default: 64)")
+    parser.add_argument("--epochs", type=int, default=10, metavar="N",
+                        help="number of epochs to train (default: 10)")
+    parser.add_argument("--lr", type=float, default=0.01, metavar="LR",
+                        help="learning rate (default: 0.01)")
+    args = parser.parse_args()
+  
+    #timestamps format
+    logging.basicConfig(
+            format="%(asctime)s %(levelname)-8s %(message)s",
+            datefmt="%Y-%m-%dT%H:%M:%SZ",
+            level=logging.DEBUG)
+
+   
+    #model taining and testing
+    epochs = args.epochs
+    batch_size = args.batch_size
+    lr = args.lr
+    for epoch in np.arange(1,epochs+1):
+        train(batch_size,epoch)
+        test() 
+
+
+    #polyaxon 
+    tracking.log_metrics(met1=0.8, met2= 0.9)
+    tracking.log_outputs(loss=0.9,accuracy=0.6)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py
new file mode 100644
index 0000000..0295e21
--- /dev/null
+++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py
@@ -0,0 +1,334 @@
+
+from asyncio import subprocess
+from concurrent.futures import process
+from itertools import count
+import json
+from os import path 
+import os
+import sys
+from time import sleep
+import random
+from kubernetes import client, config,watch
+from kubernetes.client.rest import ApiException
+from string import Template
+import yaml
+import docker
+PROJECT_ROOT = path.abspath(path.join(__file__ ,"../../.."))
+sys.path.append(PROJECT_ROOT)
+from ml_benchmark.benchmark_runner import Benchmark
+import requests
+import subprocess
+import psutil
+import logging as log
+
+
+
+class PolyaxonBenchmark(Benchmark):
+
+    def __init__(self, objective, grid, resources) -> None:
+        self.objective = objective
+        self.grid = grid
+        self.resources = resources
+        self.group="kubeflow.org"
+        self.version="v1beta1"
+        self.namespace='polyaxon'
+        self.plural="experiments"
+        self.experiment_file_name = "grid.yaml"
+        self.project_description = "Somer random description"
+        self.polyaxon_addr="http://localhost:31833/"
+
+        config.load_kube_config()
+
+        
+        log.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',level=log.INFO)
+
+    
+        if "dockerImageTag" in self.resources:
+            self.trial_tag = self.resources["dockerImageTag"]
+        else:
+            self.trial_tag = "mnist_polyaxon:latest"
+
+        if "dockerUserLogin" in self.resources:
+            self.docker_user = self.resources["dockerUserLogin"]
+            self.trial_tag =  f'{self.docker_user}/{self.trial_tag}'
+        else:
+            self.docker_user = ""
+            self.trial_tag = "witja46/mnist_polyaxon:latest"
+        
+        if "dockerUserPassword" in self.resources:
+            self.docker_pasword = self.resources["dockerUserPassword"]
+        else:
+            self.docker_pasword = ""        
+
+
+        if "studyName" in self.resources:
+            self.study_name = self.resources["studyName"]
+        else:
+            self.study_name = f"polyaxon-study-{random.randint(0, 100)}"
+
+        if "workerCpu" in self.resources:
+            self.workerCpu = self.resources["workerCpu"]
+        else:
+            self.workerCpu = 2
+
+        if "workerMemory" in resources:
+            self.workerMemory = self.resources["workerMemory"]
+        else:
+            self.workerMemory = 2
+
+        if "workerCount" in resources:
+            self.workerCount = self.resources["workerCount"]
+        else:
+            self.workerCount = 5
+
+        if "jobsCount" in resources:
+            self.jobsCount = self.resources["jobsCount"]
+        else:
+            self.jobsCount = 6
+        
+    def deploy(self):
+        """
+            With the completion of this step the desired architecture of the HPO Framework should be running
+            on a platform, e.g,. in the case of Kubernetes it referes to the steps nassary to deploy all pods
+            and services in kubernetes.
+        """
+        
+        log.info("Adding polyaxon to helm repo:")
+        res = os.popen('helm repo add polyaxon https://charts.polyaxon.com').read()
+        log.info(res)
+
+        #TODO deploy via helm for better error handling and easier configuration?
+        log.info("Deploying polyaxon to minikube:")
+        res = os.popen('polyaxon admin deploy -t minikube').read()
+        log.info(res)
+
+      
+      
+        
+
+        config.load_kube_config()
+        w = watch.Watch()
+        c = client.CoreV1Api()
+        # c = client.AppsV1Api()
+        deployed = 0
+
+        log.info("Waiting for polyaxon pods to be read:")
+        monitored_pods = ["polyaxon-polyaxon-streams","polyaxon-polyaxon-operator","polyaxon-polyaxon-gateway","polyaxon-polyaxon-api"]
+        # TODO changing to list_namespaced_deployments?
+        for e in w.stream(c.list_namespaced_pod, namespace="polyaxon"):
+            ob = e["object"]          
+            
+            for name in monitored_pods:
+                if name in ob.metadata.name:
+                    #   log.info(ob.metadata.name)
+                    #log.info(ob)
+
+                    if ob.status.phase == "Running" and ob.status.container_statuses[0].ready: 
+                    # if ob.status.ready_replicas is not None and ob.status.ready_replicas >= 1:
+                        log.info(f'{ob.metadata.name} is ready')
+                        monitored_pods.remove(name)
+                        deployed = deployed + 1
+                    #   log.info(ob)
+                        if(deployed == 4 ):
+                            w.stop()
+                            log.info("Finished deploying")
+
+                            res = os.popen('kubectl get deployment -n polyaxon').read()
+                            log.info(res)
+                            
+
+        
+  
+
+        # Starting post forwarding to the polyaxon api in the background
+        log.info("Starting post-forward to polyaxon api:")
+        self.post_forward_process = subprocess.Popen("kubectl port-forward  svc/polyaxon-polyaxon-api 31833:80  -n polyaxon",shell=True,stdout=subprocess.PIPE)
+
+
+
+
+      
+        
+    def setup(self):
+        """
+        Every Operation that is needed before the actual optimization (trial) starts and that is not relevant
+        for starting up workers or the necessary architecture.
+        """
+       
+        #creating experiment yaml          
+             
+        experiment_definition = {
+            "worker_num": self.workerCount,
+            "jobs_num":self.jobsCount,
+            "worker_cpu": self.workerCpu,
+            "worker_mem": f"{self.workerMemory}Gi",
+            "worker_image": self.trial_tag,
+            "study_name": self.study_name,
+            "trialParameters":"${trialParameters.learningRate}"
+        }
+
+        #loading and filling the template
+        with open(path.join(path.dirname(__file__), "experiment_template.yaml"), "r") as f:
+            job_template = Template(f.read())
+            job_yml_objects = job_template.substitute(experiment_definition)
+            
+        #writing the experiment definition into the file        
+        with open(path.join(path.dirname(__file__), self.experiment_file_name), "w") as f:
+            f.write(job_yml_objects)
+        log.info("Experiment yaml created")
+      
+      
+        # Creating new docker image if credentials were passed
+        if "dockerUserLogin" in self.resources:
+           
+            #creating task docker image  
+            log.info("Creating task docker image")  
+            self.client = docker.client.from_env()
+            image, logs = self.client.images.build(path="./mnist_task",tag=self.trial_tag)
+            log.info(f"Image: {self.trial_tag}")
+            for line in logs  :
+                log.info(line) 
+            
+            #pushing to repo
+            self.client.login(username=self.docker_user, password=self.docker_pasword)
+            for line in self.client.images.push(self.trial_tag, stream=True, decode=True):
+                log.info(line) 
+        
+        
+
+    def run(self):
+
+        #TODO add error handling 
+        log.info("Creating new project:")
+        project = requests.post(f'{self.polyaxon_addr}/api/v1/default/projects/create', json={"name": self.study_name, "description": self.project_description})
+        log.info(project.text)
+
+
+        #TODO find out where to pass the --eager flag so that the run can be created with help of api
+        # with open(path.join(path.dirname(__file__), self.experiment_file_name), "r") as f:
+        #     body = f.read()
+        #     log.info(body)
+        #     run = requests.post(f'{self.polyaxon_addr}/api/v1/default/{self.study_name}/runs',json={"content":body,"eager":True ,"tags":["--eager"],"meta_info":{"eager":True,"--eager":True,"flags":"--eager"}})
+        #     log.info(run.text)
+      
+      
+        log.info("Starting polyaxon experiment:")
+        res = os.popen(f'polyaxon run -f ./{self.experiment_file_name} --project {self.study_name} --eager').read()
+        log.info(res)
+
+        
+        #TODO change to kubernetes api for monitoring runing trials  
+        log.info("Waiting for the run to finish:")
+        finished = False
+        while not finished:
+            runs = self.get_succeeded_runs()
+            log.info(f'{runs["count"]} jobs out of {self.jobsCount} succeded')
+            
+            #checking if all runs were finished
+            finished = runs["count"] == self.jobsCount
+            sleep(1)
+        return 
+
+
+    def collect_benchmark_metrics(self):
+        pass
+
+
+
+    def get_succeeded_runs(self, sort_by="duration"):
+        
+        #TODO add error handling acording to polyaxon api 
+        res = requests.get(f'{self.polyaxon_addr}/api/v1/default/{self.study_name}/runs?query=status:succeeded&sort={sort_by}') 
+        result = json.loads(res.text)
+        return result
+
+    def collect_run_results(self):
+        
+
+        log.info("Collecting run results:")
+        result = self.get_succeeded_runs()
+        log.info(json.dumps(result,indent=4))               
+               
+        # log.info("\n Experiment finished with following optimal trial:")
+        # log.info(result["results"][0])
+        return result["results"]
+    
+    def test(self):
+        return super().test()
+
+    def undeploy(self):
+        """Stop the API port-forward, tear Polyaxon down and delete its namespace.
+
+        Blocks on Kubernetes watch streams until four "container not ready" pod events
+        were seen and the namespace can no longer be read; no timeout is applied.
+        """
+        log.info("Terminating post  forwarding process:")
+        process = psutil.Process(self.post_forward_process.pid)
+        # deploy() started the port-forward through a shell, so kill the shell's descendants too.
+        for proc in process.children(recursive=True):
+            proc.kill()
+        process.kill()
+
+        #TODO changing to undeploying with helm?
+        log.info("Undeploying polyaxon:")
+        # One command string, as in deploy(): with shell=True a list would hand only its first
+        # item ("polyaxon") to the POSIX shell and silently drop "admin teardown".
+        p = subprocess.Popen("polyaxon admin teardown",shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE)
+        # Answer the confirmation prompt; communicate() also waits for the CLI to exit.
+        p.communicate(input=b"y")
+
+        # Waiting until all polyaxon pods get terminated
+        config.load_kube_config()
+        w = watch.Watch()
+        c = client.CoreV1Api()
+        deployed = 0
+        log.info("Waiting for polyaxon pods to be terminated:")
+        for e in w.stream(c.list_namespaced_pod, namespace=self.namespace):
+            ob = e["object"]
+            log.debug(f'{deployed} pods out of 4 were killed')
+            log.debug("\n new in stream:\n")
+            log.debug("%s %s", ob.metadata.name, ob.status.phase)
+
+            # container_statuses is optional and may be None; treat that as "not ready".
+            statuses = ob.status.container_statuses or []
+            if not statuses or not statuses[0].ready:
+                log.info(f'Containers of {ob.metadata.name} are terminated')
+                deployed = deployed + 1
+                if deployed == 4:
+                    w.stop()
+                    break
+
+        log.info("Killed all pods deleteing the namespace:")
+        c.delete_namespace_with_http_info(name=self.namespace)
+        #TODO somehow handle the timeouts?
+        log.info("Checking status of the deleted namespace:")
+        for e in w.stream(c.list_namespace):
+            ob = e["object"]
+            # Any event for our namespace triggers a direct read; an ApiException from that
+            # read is taken as confirmation that the namespace is gone.
+            #TODO do this in other way
+            if ob.metadata.name == self.namespace:
+                try:
+                    log.debug(c.read_namespace_status_with_http_info(name=self.namespace))
+                except ApiException as err:
+                    log.info(err)
+                    w.stop()
+                    break
+
+        log.info("Finished undeploying")
+
+
+if __name__ == "__main__":
+    #main()
+    bench = PolyaxonBenchmark(1,1,resources={
+        #    "dockerUserLogin":"",
+        #    "dockerUserPassword":"",
+        # "studyName":""
+        "jobsCount":5,
+        "workerCount":5
+        })
+    bench.deploy() 
+    bench.setup()
+    bench.run()
+    bench.collect_run_results()
+    bench.undeploy()

From df16ec56d67e7c6a8869b8e87c35f4fbeea81487 Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Wed, 3 Aug 2022 13:18:12 +0200
Subject: [PATCH 2/9] Added some comments, default logging level changed to
 critical

---
 experiments/polyaxon_minikube/grid.yaml       |  2 +-
 .../polyaxon_minikube/mnist_task/task.py      | 16 ++++++++--
 .../polyaxon_minikube/polyaxon_benchmark.py   | 30 ++++++++++---------
 3 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml
index a0e9625..7a681ad 100644
--- a/experiments/polyaxon_minikube/grid.yaml
+++ b/experiments/polyaxon_minikube/grid.yaml
@@ -1,6 +1,6 @@
 version: 1.1
 kind: operation
-name: polyaxon-study-72
+name: polyaxon-study-88
 matrix:
   kind: grid
   numRuns: 5
diff --git a/experiments/polyaxon_minikube/mnist_task/task.py b/experiments/polyaxon_minikube/mnist_task/task.py
index d18bae3..aa6eb13 100644
--- a/experiments/polyaxon_minikube/mnist_task/task.py
+++ b/experiments/polyaxon_minikube/mnist_task/task.py
@@ -1,11 +1,15 @@
-from __future__ import print_function
-
 from polyaxon import tracking
 import argparse
 import logging
 import os
 import numpy as np
 import time
+import sys
+PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../.."))
+sys.path.append(PROJECT_ROOT)
+from ml_benchmark.benchmark_runner import Benchmark
+from ml_benchmark.workload.mnist.mnist_task import MnistTask
+
 
 def train(times ,epoch):
     loss = 1
@@ -56,6 +60,14 @@ def main():
         train(batch_size,epoch)
         test() 
 
+    #TODO swicht to the real task 
+    # task = MnistTask(config_init={"epochs": 1})
+    # objective = task.create_objective()
+    
+    # objective.set_hyperparameters({"learning_rate": lr, "weight_decay": 0.01})
+    # objective.train()
+    # validation_scores = objective.validate()
+    # print(validation_scores)
 
     #polyaxon 
     tracking.log_metrics(met1=0.8, met2= 0.9)
diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py
index 0295e21..c1089cd 100644
--- a/experiments/polyaxon_minikube/polyaxon_benchmark.py
+++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py
@@ -36,11 +36,12 @@ def __init__(self, objective, grid, resources) -> None:
         self.experiment_file_name = "grid.yaml"
         self.project_description = "Somer random description"
         self.polyaxon_addr="http://localhost:31833/"
-
+        
         config.load_kube_config()
 
-        
-        log.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',level=log.INFO)
+        self.logging_level= self.resources.get("loggingLevel",log.CRITICAL)
+
+        log.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',level=self.logging_level)
 
     
         if "dockerImageTag" in self.resources:
@@ -112,29 +113,29 @@ def deploy(self):
         # c = client.AppsV1Api()
         deployed = 0
 
+
         log.info("Waiting for polyaxon pods to be read:")
+        # From all pods that polyaxon starts we are onlly really intrested for following 4 that are crucial for runnig of the experiments 
         monitored_pods = ["polyaxon-polyaxon-streams","polyaxon-polyaxon-operator","polyaxon-polyaxon-gateway","polyaxon-polyaxon-api"]
         # TODO changing to list_namespaced_deployments?
         for e in w.stream(c.list_namespaced_pod, namespace="polyaxon"):
             ob = e["object"]          
             
             for name in monitored_pods:
-                if name in ob.metadata.name:
-                    #   log.info(ob.metadata.name)
-                    #log.info(ob)
 
+                #checking if it is one of the pods that we want to monitor 
+                if name in ob.metadata.name:
+                    
+                    # Checking if the pod already is runnig and its underlying containers are ready
                     if ob.status.phase == "Running" and ob.status.container_statuses[0].ready: 
-                    # if ob.status.ready_replicas is not None and ob.status.ready_replicas >= 1:
                         log.info(f'{ob.metadata.name} is ready')
                         monitored_pods.remove(name)
                         deployed = deployed + 1
-                    #   log.info(ob)
+
+                        #if all monitored pods are running the deployment process was ended
                         if(deployed == 4 ):
                             w.stop()
-                            log.info("Finished deploying")
-
-                            res = os.popen('kubectl get deployment -n polyaxon').read()
-                            log.info(res)
+                            log.info("Finished deploying crucial pods")
                             
 
         
@@ -217,7 +218,7 @@ def run(self):
         log.info(res)
 
         
-        #TODO change to kubernetes api for monitoring runing trials  
+        #TODO switch to kubernetes api for monitoring runing trials  
         log.info("Waiting for the run to finish:")
         finished = False
         while not finished:
@@ -325,7 +326,8 @@ def undeploy(self):
         #    "dockerUserPassword":"",
         # "studyName":""
         "jobsCount":5,
-        "workerCount":5
+        "workerCount":5,
+        "loggingLevel":log.INFO
         })
     bench.deploy() 
     bench.setup()

From 6def8e90810b4115e7801d5ed8934642e40768b9 Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Thu, 4 Aug 2022 14:25:25 +0200
Subject: [PATCH 3/9] Polyaxon now runs with help of BenchmarkRunner, changed
 logging style, default logging level == INFO

---
 ...mark_results__2022-08-04_13-00-24__id.json |  1 +
 ...mark_results__2022-08-04_13-06-41__id.json |  1 +
 ...mark_results__2022-08-04_14-06-58__id.json |  1 +
 experiments/polyaxon_minikube/grid.yaml       |  2 +-
 .../polyaxon_minikube/mnist_task/task.py      | 13 ++--
 .../polyaxon_minikube/polyaxon_benchmark.py   | 62 +++++++++++++------
 ml_benchmark/__init__.py                      |  2 +-
 7 files changed, 53 insertions(+), 29 deletions(-)
 create mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json
 create mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json
 create mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json

diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json
new file mode 100644
index 0000000..e898192
--- /dev/null
+++ b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json
@@ -0,0 +1 @@
+{"benchmark_metrics": {"latency": [{"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_145644815288", "function_name": "deploy", "start_time": "2022-08-04 13:00:27.963286", "end_time": "2022-08-04 13:02:02.040017", "duration_sec": 94.076731}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_145644815288", "function_name": "setup", "start_time": "2022-08-04 13:02:02.103506", "end_time": "2022-08-04 13:02:02.108233", "duration_sec": 0.004727}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_145644815288", "function_name": "run", "start_time": "2022-08-04 13:02:02.131942", "end_time": "2022-08-04 13:02:25.957963", "duration_sec": 23.826021}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_145644815288", "function_name": "collect_run_results", "start_time": "2022-08-04 13:02:25.981388", "end_time": "2022-08-04 13:02:25.994767", "duration_sec": 0.013379}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_145644815288", "function_name": "test", "start_time": "2022-08-04 13:02:26.015907", "end_time": "2022-08-04 13:02:26.015907", "duration_sec": 0.0}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_145644815288", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 13:02:26.035917", "end_time": "2022-08-04 13:02:26.035917", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"jobsCount": 5, "workerCount": 5, "loggingLevel": 20}}}
\ No newline at end of file
diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json
new file mode 100644
index 0000000..b14a3fa
--- /dev/null
+++ b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json
@@ -0,0 +1 @@
+{"benchmark_metrics": {"latency": [{"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_133265314672", "function_name": "deploy", "start_time": "2022-08-04 13:06:45.483067", "end_time": "2022-08-04 13:08:15.610367", "duration_sec": 90.1273}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_133265314672", "function_name": "setup", "start_time": "2022-08-04 13:08:15.663572", "end_time": "2022-08-04 13:08:15.664632", "duration_sec": 0.00106}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_133265314672", "function_name": "run", "start_time": "2022-08-04 13:08:15.687857", "end_time": "2022-08-04 13:08:39.550174", "duration_sec": 23.862317}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_133265314672", "function_name": "collect_run_results", "start_time": "2022-08-04 13:08:39.576174", "end_time": "2022-08-04 13:08:39.588927", "duration_sec": 0.012753}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_133265314672", "function_name": "test", "start_time": "2022-08-04 13:08:39.611043", "end_time": "2022-08-04 13:08:39.611043", "duration_sec": 0.0}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_133265314672", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 13:08:39.633044", "end_time": "2022-08-04 13:08:39.633044", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"jobsCount": 5, "workerCount": 5, "loggingLevel": 50}}}
\ No newline at end of file
diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json
new file mode 100644
index 0000000..2de26d8
--- /dev/null
+++ b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json
@@ -0,0 +1 @@
+{"benchmark_metrics": {"latency": [{"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_125399655361", "function_name": "deploy", "start_time": "2022-08-04 14:07:02.174784", "end_time": "2022-08-04 14:08:31.542018", "duration_sec": 89.367234}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_125399655361", "function_name": "setup", "start_time": "2022-08-04 14:08:31.588224", "end_time": "2022-08-04 14:08:36.247162", "duration_sec": 4.658938}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_125399655361", "function_name": "run", "start_time": "2022-08-04 14:08:36.268226", "end_time": "2022-08-04 14:08:59.486441", "duration_sec": 23.218215}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_125399655361", "function_name": "collect_run_results", "start_time": "2022-08-04 14:08:59.507670", "end_time": "2022-08-04 14:08:59.521064", "duration_sec": 0.013394}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_125399655361", "function_name": "test", "start_time": "2022-08-04 14:08:59.543152", "end_time": "2022-08-04 14:08:59.543152", "duration_sec": 0.0}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_125399655361", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 14:08:59.562769", "end_time": "2022-08-04 14:08:59.562769", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"dockerUserLogin": "witja46", "dockerUserPassword": "J$rmakowicz1998", "jobsCount": 5, "workerCount": 5, "loggingLevel": 20}}}
\ No newline at end of file
diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml
index 7a681ad..70d540d 100644
--- a/experiments/polyaxon_minikube/grid.yaml
+++ b/experiments/polyaxon_minikube/grid.yaml
@@ -1,6 +1,6 @@
 version: 1.1
 kind: operation
-name: polyaxon-study-88
+name: polyaxon-study-15
 matrix:
   kind: grid
   numRuns: 5
diff --git a/experiments/polyaxon_minikube/mnist_task/task.py b/experiments/polyaxon_minikube/mnist_task/task.py
index aa6eb13..7fd4d46 100644
--- a/experiments/polyaxon_minikube/mnist_task/task.py
+++ b/experiments/polyaxon_minikube/mnist_task/task.py
@@ -1,14 +1,13 @@
 from polyaxon import tracking
 import argparse
 import logging
-import os
 import numpy as np
 import time
-import sys
-PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../.."))
-sys.path.append(PROJECT_ROOT)
-from ml_benchmark.benchmark_runner import Benchmark
-from ml_benchmark.workload.mnist.mnist_task import MnistTask
+# import os
+# PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../.."))
+# sys.path.append(PROJECT_ROOT)
+# from ml_benchmark.benchmark_runner import Benchmark
+# from ml_benchmark.workload.mnist.mnist_task import MnistTask
 
 
 def train(times ,epoch):
@@ -60,7 +59,7 @@ def main():
         train(batch_size,epoch)
         test() 
 
-    #TODO swicht to the real task 
+    # #TODO swicht to the real task 
     # task = MnistTask(config_init={"epochs": 1})
     # objective = task.create_objective()
     
diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py
index c1089cd..d8e92ac 100644
--- a/experiments/polyaxon_minikube/polyaxon_benchmark.py
+++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py
@@ -5,16 +5,14 @@
 import json
 from os import path 
 import os
+from socket import timeout
 import sys
 from time import sleep
 import random
 from kubernetes import client, config,watch
 from kubernetes.client.rest import ApiException
 from string import Template
-import yaml
 import docker
-PROJECT_ROOT = path.abspath(path.join(__file__ ,"../../.."))
-sys.path.append(PROJECT_ROOT)
 from ml_benchmark.benchmark_runner import Benchmark
 import requests
 import subprocess
@@ -25,9 +23,9 @@
 
 class PolyaxonBenchmark(Benchmark):
 
-    def __init__(self, objective, grid, resources) -> None:
-        self.objective = objective
-        self.grid = grid
+    def __init__(self, resources) -> None:
+        # self.objective = objective
+        # self.grid = grid
         self.resources = resources
         self.group="kubeflow.org"
         self.version="v1beta1"
@@ -36,12 +34,13 @@ def __init__(self, objective, grid, resources) -> None:
         self.experiment_file_name = "grid.yaml"
         self.project_description = "Somer random description"
         self.polyaxon_addr="http://localhost:31833/"
+        self.post_forward_process=False
         
         config.load_kube_config()
 
         self.logging_level= self.resources.get("loggingLevel",log.CRITICAL)
 
-        log.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',level=self.logging_level)
+        log.basicConfig(format='%(asctime)s Polyaxon Benchmark %(levelname)s: %(message)s',level=self.logging_level)
 
     
         if "dockerImageTag" in self.resources:
@@ -190,11 +189,14 @@ def setup(self):
             for line in logs  :
                 log.info(line) 
             
+
+            #TODO check if the image is runing properly and there is no  problems with it
+
             #pushing to repo
             self.client.login(username=self.docker_user, password=self.docker_pasword)
             for line in self.client.images.push(self.trial_tag, stream=True, decode=True):
                 log.info(line) 
-        
+            
         
 
     def run(self):
@@ -218,6 +220,8 @@ def run(self):
         log.info(res)
 
         
+        #TODO check if there is no problem with the image
+        
         #TODO switch to kubernetes api for monitoring runing trials  
         log.info("Waiting for the run to finish:")
         finished = False
@@ -259,11 +263,12 @@ def test(self):
 
     def undeploy(self):
         
-        log.info("Terminating post  forwarding process:")
-        process = psutil.Process(self.post_forward_process.pid)
-        for proc in process.children(recursive=True):
-            proc.kill()
-        process.kill()
+        if(self.post_forward_process):
+            log.info("Terminating post  forwarding process:")
+            process = psutil.Process(self.post_forward_process.pid)
+            for proc in process.children(recursive=True):
+                proc.kill()
+            process.kill()
       
 
 
@@ -308,11 +313,14 @@ def undeploy(self):
             ob = e["object"]
             # if the status of our namespace was changed we check if it the namespace was really removed from the cluster by requesting and expecting it to be not found
             #TODO do this in other way
+
+            
             if ob.metadata.name == self.namespace:
                 try:
                     log.debug(c.read_namespace_status_with_http_info(name=self.namespace))
                 except ApiException as err:
                     log.info(err)
+                    log.info("Namespace sucessfully deleted")
                     w.stop()
                     break
 
@@ -321,16 +329,30 @@ def undeploy(self):
 
 if __name__ == "__main__":
     #main()
-    bench = PolyaxonBenchmark(1,1,resources={
+    # bench = PolyaxonBenchmark(resources={
+    #         "dockerUserLogin":"",
+    #         "dockerUserPassword":"",
+    #     # "studyName":""
+    #     "jobsCount":5,
+    #     "workerCount":5,
+    #     "loggingLevel":log.INFO
+    #     })
+    # bench.deploy() 
+    # bench.setup()
+    # bench.run()
+    # bench.collect_run_results()
+    # bench.undeploy()
+
+    resources={
         #    "dockerUserLogin":"",
         #    "dockerUserPassword":"",
         # "studyName":""
         "jobsCount":5,
         "workerCount":5,
         "loggingLevel":log.INFO
-        })
-    bench.deploy() 
-    bench.setup()
-    bench.run()
-    bench.collect_run_results()
-    bench.undeploy()
+    }
+    from ml_benchmark.benchmark_runner import BenchmarkRunner
+    runner = BenchmarkRunner(
+        benchmark_cls=PolyaxonBenchmark, resources=resources)
+    runner.run()
+
diff --git a/ml_benchmark/__init__.py b/ml_benchmark/__init__.py
index a1abe42..242acda 100644
--- a/ml_benchmark/__init__.py
+++ b/ml_benchmark/__init__.py
@@ -1,7 +1,7 @@
 __version__ = "develop"
 install_requires = [
         "scikit-learn==0.24.2", 
-        "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==5.0.3",
+        "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==4.2.2",
         "psycopg2-binary"],
 test_install_requires = ["pytest==7.1.2", "pytest-cov==3.0.0"]
 URL = "https://github.com/gebauerm/ml_benchmark"

From a66fad620ce22100c9c45741b27868c35f56e46b Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Thu, 4 Aug 2022 14:40:13 +0200
Subject: [PATCH 4/9] Rerun

---
 .../benchmark_results__2022-08-04_14-28-03__id.json             | 1 +
 experiments/polyaxon_minikube/grid.yaml                         | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)
 create mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json

diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json
new file mode 100644
index 0000000..a9558b1
--- /dev/null
+++ b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json
@@ -0,0 +1 @@
+{"benchmark_metrics": {"latency": [{"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_115109159852", "function_name": "deploy", "start_time": "2022-08-04 14:28:05.734421", "end_time": "2022-08-04 14:29:23.529874", "duration_sec": 77.795453}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_115109159852", "function_name": "setup", "start_time": "2022-08-04 14:29:23.580757", "end_time": "2022-08-04 14:29:23.581278", "duration_sec": 0.000521}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_115109159852", "function_name": "run", "start_time": "2022-08-04 14:29:23.609547", "end_time": "2022-08-04 14:29:48.600260", "duration_sec": 24.990713}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_115109159852", "function_name": "collect_run_results", "start_time": "2022-08-04 14:29:48.624975", "end_time": "2022-08-04 14:29:48.638567", "duration_sec": 0.013592}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_115109159852", "function_name": "test", "start_time": "2022-08-04 14:29:48.661305", "end_time": "2022-08-04 14:29:48.661305", "duration_sec": 0.0}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_115109159852", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 14:29:48.682667", "end_time": "2022-08-04 14:29:48.682667", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"jobsCount": 5, "workerCount": 5, "loggingLevel": 20}}}
\ No newline at end of file
diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml
index 70d540d..00f6adf 100644
--- a/experiments/polyaxon_minikube/grid.yaml
+++ b/experiments/polyaxon_minikube/grid.yaml
@@ -1,6 +1,6 @@
 version: 1.1
 kind: operation
-name: polyaxon-study-15
+name: polyaxon-study-29
 matrix:
   kind: grid
   numRuns: 5

From 36f05eacd2b889ee732d77074e18e730c22b14ef Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Wed, 10 Aug 2022 11:37:45 +0200
Subject: [PATCH 5/9] Mnist task docker image is now generated from
 polyaxon_benchmark

---
 .gitignore                                    |  1 +
 .../experiment_template.yaml                  |  9 ++-
 experiments/polyaxon_minikube/grid.yaml       | 13 +++-
 .../polyaxon_minikube/mnist_task/Dockerfile   | 23 +++---
 .../polyaxon_minikube/mnist_task/task.py      | 73 +++++++++++--------
 .../polyaxon_minikube/polyaxon_benchmark.py   | 40 ++++++----
 .../polyaxon_minikube/requirements.txt        |  1 +
 7 files changed, 101 insertions(+), 59 deletions(-)
 create mode 100644 experiments/polyaxon_minikube/requirements.txt

diff --git a/.gitignore b/.gitignore
index 89eb912..757c4e3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -131,5 +131,6 @@ dmypy.json
 exp__*
 experiments/simple_raytune/benchmark__RaytuneBenchmark
 experiments/optuna_minikube/benchmark__OptunaMinikubeBenchmark
+experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark
 
 data/
\ No newline at end of file
diff --git a/experiments/polyaxon_minikube/experiment_template.yaml b/experiments/polyaxon_minikube/experiment_template.yaml
index 507a2a5..115774d 100644
--- a/experiments/polyaxon_minikube/experiment_template.yaml
+++ b/experiments/polyaxon_minikube/experiment_template.yaml
@@ -17,5 +17,12 @@ component:
     kind: job
     container:
       image: $worker_image
-      command: [python3, /opt/mnist_task/task.py]
+      command: [python3, experiments/polyaxon_minikube/mnist_task/task.py]
       args: ["--batch-size=64", "--lr={{ lr }}"]
+      env:
+        - name: STUDY_NAME
+          value: "$study_name"
+        - name: DB_CONN
+          value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
+        - name: "METRICS_STORAGE_HOST"
+          value: "$metrics_ip"
diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml
index 00f6adf..a636cd3 100644
--- a/experiments/polyaxon_minikube/grid.yaml
+++ b/experiments/polyaxon_minikube/grid.yaml
@@ -1,6 +1,6 @@
 version: 1.1
 kind: operation
-name: polyaxon-study-29
+name: polyaxon-study-3
 matrix:
   kind: grid
   numRuns: 5
@@ -16,6 +16,13 @@ component:
   run:
     kind: job
     container:
-      image: witja46/mnist_polyaxon:latest
-      command: [python3, /opt/mnist_task/task.py]
+      image: witja46/mnist_test
+      command: [python3, experiments/polyaxon_minikube/mnist_task/task.py]
       args: ["--batch-size=64", "--lr={{ lr }}"]
+      env:
+        - name: STUDY_NAME
+          value: "polyaxon-study-3"
+        - name: DB_CONN
+          value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
+        - name: "METRICS_STORAGE_HOST"
+          value: "134.101.4.161"
diff --git a/experiments/polyaxon_minikube/mnist_task/Dockerfile b/experiments/polyaxon_minikube/mnist_task/Dockerfile
index 1cbfd5b..5ed393d 100644
--- a/experiments/polyaxon_minikube/mnist_task/Dockerfile
+++ b/experiments/polyaxon_minikube/mnist_task/Dockerfile
@@ -1,15 +1,16 @@
-FROM python:3.9-slim
+#TODO change to some slimer base image
+FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime
+#FROM python:3.9-slim
 
-ADD . /opt/mnist_task
-WORKDIR /opt/mnist_task
+RUN pip install pip --upgrade
+RUN pip install polyaxon
 
-# Add folder for the logs.
-RUN mkdir /katib
-RUN pip install --no-cache-dir -r requirements.txt
+COPY experiments experiments
+COPY data data
+COPY setup.py setup.py
+COPY ml_benchmark ml_benchmark
 
-RUN chgrp -R 0 /opt/mnist_task \
-  && chmod -R g+rwX /opt/mnist_task \
-  && chgrp -R 0 /katib \
-  && chmod -R g+rwX /katib
 
-ENTRYPOINT ["python3", "/opt/mnist_task/task.py"]
+RUN pip install -e .
+
+CMD ["python", "experiments/polyaxon_minikube/mnist_task/task.py"]
diff --git a/experiments/polyaxon_minikube/mnist_task/task.py b/experiments/polyaxon_minikube/mnist_task/task.py
index 7fd4d46..3763745 100644
--- a/experiments/polyaxon_minikube/mnist_task/task.py
+++ b/experiments/polyaxon_minikube/mnist_task/task.py
@@ -3,11 +3,11 @@
 import logging
 import numpy as np
 import time
-# import os
-# PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../.."))
-# sys.path.append(PROJECT_ROOT)
-# from ml_benchmark.benchmark_runner import Benchmark
-# from ml_benchmark.workload.mnist.mnist_task import MnistTask
+import os
+import sys
+PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../../../"))
+sys.path.append(PROJECT_ROOT)
+from ml_benchmark.workload.mnist.mnist_task import MnistTask
 
 
 def train(times ,epoch):
@@ -30,9 +30,7 @@ def test():
 
 
 def main():
-    
-    #Polyaxon
-    tracking.init()
+   
     
     #parsing arguments 
     parser = argparse.ArgumentParser(description="MNIST Example")
@@ -43,34 +41,47 @@ def main():
     parser.add_argument("--lr", type=float, default=0.01, metavar="LR",
                         help="learning rate (default: 0.01)")
     args = parser.parse_args()
-  
-    #timestamps format
-    logging.basicConfig(
-            format="%(asctime)s %(levelname)-8s %(message)s",
-            datefmt="%Y-%m-%dT%H:%M:%SZ",
-            level=logging.DEBUG)
-
-   
-    #model taining and testing
     epochs = args.epochs
     batch_size = args.batch_size
     lr = args.lr
-    for epoch in np.arange(1,epochs+1):
-        train(batch_size,epoch)
-        test() 
 
-    # #TODO swicht to the real task 
-    # task = MnistTask(config_init={"epochs": 1})
-    # objective = task.create_objective()
+
+
+    #TODO delete logging?
+    #Some logging
+    # timestamps format
+    # logging.basicConfig(
+    #         format="%(asctime)s %(levelname)-8s %(message)s",
+    #         datefmt="%Y-%m-%dT%H:%M:%SZ",
+    #         level=logging.DEBUG)
+    # for epoch in np.arange(1,epochs+1):
+    #     train(batch_size,epoch)
+    #     test() 
+
+
+
+    #MnistTask
+    task = MnistTask(config_init={"epochs": 1})
+    objective = task.create_objective()
+    #TODO add the weight decay to the definition of the template
+    objective.set_hyperparameters({"learning_rate": lr, "weight_decay": 0.01})
+    objective.train()
+    validation_scores = objective.validate()
     
-    # objective.set_hyperparameters({"learning_rate": lr, "weight_decay": 0.01})
-    # objective.train()
-    # validation_scores = objective.validate()
-    # print(validation_scores)
-
-    #polyaxon 
-    tracking.log_metrics(met1=0.8, met2= 0.9)
-    tracking.log_outputs(loss=0.9,accuracy=0.6)
+  
+    #Geting results
+    avg = validation_scores["weighted avg"]
+    print("precision",avg["precision"])
+    print("f1-score",avg["f1-score"])
+
+
+    # Polyaxon
+    #initiating polyaxon tracking
+    tracking.init()    
+    #loging metrics 
+    tracking.log_metrics(recall=avg["recall"], )
+    #logging the results 
+    tracking.log_outputs(f1score=avg["f1-score"],precision=avg["precision"])
 
 
 if __name__ == "__main__":
diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py
index d8e92ac..4bc1eb8 100644
--- a/experiments/polyaxon_minikube/polyaxon_benchmark.py
+++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py
@@ -1,5 +1,6 @@
 
 from asyncio import subprocess
+from base64 import decode
 from concurrent.futures import process
 from itertools import count
 import json
@@ -9,6 +10,7 @@
 import sys
 from time import sleep
 import random
+from urllib.request import urlopen
 from kubernetes import client, config,watch
 from kubernetes.client.rest import ApiException
 from string import Template
@@ -42,18 +44,20 @@ def __init__(self, resources) -> None:
 
         log.basicConfig(format='%(asctime)s Polyaxon Benchmark %(levelname)s: %(message)s',level=self.logging_level)
 
+
+        self.metrics_ip = resources.get("metricsIP")
     
         if "dockerImageTag" in self.resources:
             self.trial_tag = self.resources["dockerImageTag"]
         else:
-            self.trial_tag = "mnist_polyaxon:latest"
+            self.trial_tag = "mnist_test"
 
         if "dockerUserLogin" in self.resources:
             self.docker_user = self.resources["dockerUserLogin"]
-            self.trial_tag =  f'{self.docker_user}/{self.trial_tag}'
         else:
-            self.docker_user = ""
-            self.trial_tag = "witja46/mnist_polyaxon:latest"
+            self.docker_user = "witja46"
+    
+        self.trial_tag =  f'{self.docker_user}/{self.trial_tag}'
         
         if "dockerUserPassword" in self.resources:
             self.docker_pasword = self.resources["dockerUserPassword"]
@@ -164,7 +168,8 @@ def setup(self):
             "worker_mem": f"{self.workerMemory}Gi",
             "worker_image": self.trial_tag,
             "study_name": self.study_name,
-            "trialParameters":"${trialParameters.learningRate}"
+            "trialParameters":"${trialParameters.learningRate}",
+            "metrics_ip": self.metrics_ip,
         }
 
         #loading and filling the template
@@ -182,9 +187,11 @@ def setup(self):
         if "dockerUserLogin" in self.resources:
            
             #creating task docker image  
-            log.info("Creating task docker image")  
+            log.info("Creating task docker image")            
+            #TODO do all this inside of minikube instead of on the client?  
             self.client = docker.client.from_env()
-            image, logs = self.client.images.build(path="./mnist_task",tag=self.trial_tag)
+            PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../"))
+            image, logs = self.client.images.build(path=PROJECT_ROOT, dockerfile="experiments/polyaxon_minikube/mnist_task/Dockerfile" ,tag=self.trial_tag)
             log.info(f"Image: {self.trial_tag}")
             for line in logs  :
                 log.info(line) 
@@ -194,7 +201,10 @@ def setup(self):
 
             #pushing to repo
             self.client.login(username=self.docker_user, password=self.docker_pasword)
-            for line in self.client.images.push(self.trial_tag, stream=True, decode=True):
+            
+            log.info("Pushing image to the Dockerhub")
+            
+            for line in self.client.api.push(self.trial_tag,stream=True,decode=True): 
                 log.info(line) 
             
         
@@ -282,6 +292,7 @@ def undeploy(self):
 
 
         # Waiting untill all polyaxon pods get terminated 
+        #TODO add logic in case of no existent polyaxon deployment 
         config.load_kube_config()
         w = watch.Watch()
         c = client.CoreV1Api()
@@ -337,22 +348,25 @@ def undeploy(self):
     #     "workerCount":5,
     #     "loggingLevel":log.INFO
     #     })
-    # bench.deploy() 
+    #bench.deploy() 
     # bench.setup()
     # bench.run()
     # bench.collect_run_results()
-    # bench.undeploy()
+   # bench.undeploy()
 
     resources={
-        #    "dockerUserLogin":"",
-        #    "dockerUserPassword":"",
+        # "dockerUserLogin":"",
+        # "dockerUserPassword":"",
         # "studyName":""
+        "dockerImageTag":"mnist_test",
         "jobsCount":5,
         "workerCount":5,
-        "loggingLevel":log.INFO
+        "loggingLevel":log.INFO,
+        "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip()
     }
     from ml_benchmark.benchmark_runner import BenchmarkRunner
     runner = BenchmarkRunner(
         benchmark_cls=PolyaxonBenchmark, resources=resources)
     runner.run()
 
+
diff --git a/experiments/polyaxon_minikube/requirements.txt b/experiments/polyaxon_minikube/requirements.txt
new file mode 100644
index 0000000..4e272bb
--- /dev/null
+++ b/experiments/polyaxon_minikube/requirements.txt
@@ -0,0 +1 @@
+polyaxon

From f791be277e4ed93646b8eac1576eac5cea892a32 Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Tue, 16 Aug 2022 16:52:31 +0200
Subject: [PATCH 6/9] 1. Switched from os.popen to invoking Polyaxon CLI
 commands with click's CliRunner. 2. Docker image now gets built inside of
 minikube. 3. Double-checked that job containers get the IP address of the
 host. Still, the latencies of those jobs do not get recorded.

---
 ...mark_results__2022-08-04_13-00-24__id.json |   1 -
 ...mark_results__2022-08-04_13-06-41__id.json |   1 -
 ...mark_results__2022-08-04_14-06-58__id.json |   1 -
 ...mark_results__2022-08-04_14-28-03__id.json |   1 -
 .../experiment_template.yaml                  |   1 +
 experiments/polyaxon_minikube/grid.yaml       |   7 +-
 .../polyaxon_minikube/mnist_task/Dockerfile   |   1 +
 .../polyaxon_minikube/mnist_task/task.py      |  21 ++-
 .../polyaxon_minikube/polyaxon_benchmark.py   | 148 +++++++++++-------
 .../polyaxon_minikube/requirements.txt        |   1 +
 10 files changed, 108 insertions(+), 75 deletions(-)
 delete mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json
 delete mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json
 delete mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json
 delete mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json

diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json
deleted file mode 100644
index e898192..0000000
--- a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json
+++ /dev/null
@@ -1 +0,0 @@
-{"benchmark_metrics": {"latency": [{"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_145644815288", "function_name": "deploy", "start_time": "2022-08-04 13:00:27.963286", "end_time": "2022-08-04 13:02:02.040017", "duration_sec": 94.076731}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_145644815288", "function_name": "setup", "start_time": "2022-08-04 13:02:02.103506", "end_time": "2022-08-04 13:02:02.108233", "duration_sec": 0.004727}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_145644815288", "function_name": "run", "start_time": "2022-08-04 13:02:02.131942", "end_time": "2022-08-04 13:02:25.957963", "duration_sec": 23.826021}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_145644815288", "function_name": "collect_run_results", "start_time": "2022-08-04 13:02:25.981388", "end_time": "2022-08-04 13:02:25.994767", "duration_sec": 0.013379}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_145644815288", "function_name": "test", "start_time": "2022-08-04 13:02:26.015907", "end_time": "2022-08-04 13:02:26.015907", "duration_sec": 0.0}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_145644815288", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 13:02:26.035917", "end_time": "2022-08-04 13:02:26.035917", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"jobsCount": 5, "workerCount": 5, "loggingLevel": 20}}}
\ No newline at end of file
diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json
deleted file mode 100644
index b14a3fa..0000000
--- a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json
+++ /dev/null
@@ -1 +0,0 @@
-{"benchmark_metrics": {"latency": [{"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_133265314672", "function_name": "deploy", "start_time": "2022-08-04 13:06:45.483067", "end_time": "2022-08-04 13:08:15.610367", "duration_sec": 90.1273}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_133265314672", "function_name": "setup", "start_time": "2022-08-04 13:08:15.663572", "end_time": "2022-08-04 13:08:15.664632", "duration_sec": 0.00106}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_133265314672", "function_name": "run", "start_time": "2022-08-04 13:08:15.687857", "end_time": "2022-08-04 13:08:39.550174", "duration_sec": 23.862317}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_133265314672", "function_name": "collect_run_results", "start_time": "2022-08-04 13:08:39.576174", "end_time": "2022-08-04 13:08:39.588927", "duration_sec": 0.012753}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_133265314672", "function_name": "test", "start_time": "2022-08-04 13:08:39.611043", "end_time": "2022-08-04 13:08:39.611043", "duration_sec": 0.0}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_133265314672", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 13:08:39.633044", "end_time": "2022-08-04 13:08:39.633044", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"jobsCount": 5, "workerCount": 5, "loggingLevel": 50}}}
\ No newline at end of file
diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json
deleted file mode 100644
index 2de26d8..0000000
--- a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json
+++ /dev/null
@@ -1 +0,0 @@
-{"benchmark_metrics": {"latency": [{"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_125399655361", "function_name": "deploy", "start_time": "2022-08-04 14:07:02.174784", "end_time": "2022-08-04 14:08:31.542018", "duration_sec": 89.367234}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_125399655361", "function_name": "setup", "start_time": "2022-08-04 14:08:31.588224", "end_time": "2022-08-04 14:08:36.247162", "duration_sec": 4.658938}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_125399655361", "function_name": "run", "start_time": "2022-08-04 14:08:36.268226", "end_time": "2022-08-04 14:08:59.486441", "duration_sec": 23.218215}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_125399655361", "function_name": "collect_run_results", "start_time": "2022-08-04 14:08:59.507670", "end_time": "2022-08-04 14:08:59.521064", "duration_sec": 0.013394}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_125399655361", "function_name": "test", "start_time": "2022-08-04 14:08:59.543152", "end_time": "2022-08-04 14:08:59.543152", "duration_sec": 0.0}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_125399655361", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 14:08:59.562769", "end_time": "2022-08-04 14:08:59.562769", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"dockerUserLogin": "witja46", "dockerUserPassword": "J$rmakowicz1998", "jobsCount": 5, "workerCount": 5, "loggingLevel": 20}}}
\ No newline at end of file
diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json
deleted file mode 100644
index a9558b1..0000000
--- a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json
+++ /dev/null
@@ -1 +0,0 @@
-{"benchmark_metrics": {"latency": [{"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_115109159852", "function_name": "deploy", "start_time": "2022-08-04 14:28:05.734421", "end_time": "2022-08-04 14:29:23.529874", "duration_sec": 77.795453}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_115109159852", "function_name": "setup", "start_time": "2022-08-04 14:29:23.580757", "end_time": "2022-08-04 14:29:23.581278", "duration_sec": 0.000521}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_115109159852", "function_name": "run", "start_time": "2022-08-04 14:29:23.609547", "end_time": "2022-08-04 14:29:48.600260", "duration_sec": 24.990713}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_115109159852", "function_name": "collect_run_results", "start_time": "2022-08-04 14:29:48.624975", "end_time": "2022-08-04 14:29:48.638567", "duration_sec": 0.013592}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_115109159852", "function_name": "test", "start_time": "2022-08-04 14:29:48.661305", "end_time": "2022-08-04 14:29:48.661305", "duration_sec": 0.0}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_115109159852", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 14:29:48.682667", "end_time": "2022-08-04 14:29:48.682667", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"jobsCount": 5, "workerCount": 5, "loggingLevel": 20}}}
\ No newline at end of file
diff --git a/experiments/polyaxon_minikube/experiment_template.yaml b/experiments/polyaxon_minikube/experiment_template.yaml
index 115774d..703895f 100644
--- a/experiments/polyaxon_minikube/experiment_template.yaml
+++ b/experiments/polyaxon_minikube/experiment_template.yaml
@@ -16,6 +16,7 @@ component:
   run:
     kind: job
     container:
+      imagePullPolicy: IfNotPresent
       image: $worker_image
       command: [python3, experiments/polyaxon_minikube/mnist_task/task.py]
       args: ["--batch-size=64", "--lr={{ lr }}"]
diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml
index a636cd3..3665986 100644
--- a/experiments/polyaxon_minikube/grid.yaml
+++ b/experiments/polyaxon_minikube/grid.yaml
@@ -1,6 +1,6 @@
 version: 1.1
 kind: operation
-name: polyaxon-study-3
+name: polyaxon-study-50
 matrix:
   kind: grid
   numRuns: 5
@@ -16,12 +16,13 @@ component:
   run:
     kind: job
     container:
-      image: witja46/mnist_test
+      imagePullPolicy: IfNotPresent
+      image: mnist_test
       command: [python3, experiments/polyaxon_minikube/mnist_task/task.py]
       args: ["--batch-size=64", "--lr={{ lr }}"]
       env:
         - name: STUDY_NAME
-          value: "polyaxon-study-3"
+          value: "polyaxon-study-50"
         - name: DB_CONN
           value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
         - name: "METRICS_STORAGE_HOST"
diff --git a/experiments/polyaxon_minikube/mnist_task/Dockerfile b/experiments/polyaxon_minikube/mnist_task/Dockerfile
index 5ed393d..1ef3f09 100644
--- a/experiments/polyaxon_minikube/mnist_task/Dockerfile
+++ b/experiments/polyaxon_minikube/mnist_task/Dockerfile
@@ -11,6 +11,7 @@ COPY setup.py setup.py
 COPY ml_benchmark ml_benchmark
 
 
+
 RUN pip install -e .
 
 CMD ["python", "experiments/polyaxon_minikube/mnist_task/task.py"]
diff --git a/experiments/polyaxon_minikube/mnist_task/task.py b/experiments/polyaxon_minikube/mnist_task/task.py
index 3763745..6853d47 100644
--- a/experiments/polyaxon_minikube/mnist_task/task.py
+++ b/experiments/polyaxon_minikube/mnist_task/task.py
@@ -14,7 +14,7 @@ def train(times ,epoch):
     loss = 1
     for x in np.arange(1,times + 1): 
         loss = 1 - (x -1 ) / times
-        time.sleep(0.01)
+        time.sleep(0.1)
         msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
                 epoch, x , times,
                 100. * x / times, loss)
@@ -47,21 +47,11 @@ def main():
 
 
 
-    #TODO delete logging?
-    #Some logging
-    # timestamps format
-    # logging.basicConfig(
-    #         format="%(asctime)s %(levelname)-8s %(message)s",
-    #         datefmt="%Y-%m-%dT%H:%M:%SZ",
-    #         level=logging.DEBUG)
-    # for epoch in np.arange(1,epochs+1):
-    #     train(batch_size,epoch)
-    #     test() 
 
 
 
     #MnistTask
-    task = MnistTask(config_init={"epochs": 1})
+    task = MnistTask(config_init={"epochs": epochs})
     objective = task.create_objective()
     #TODO add the weight decay to the definition of the template
     objective.set_hyperparameters({"learning_rate": lr, "weight_decay": 0.01})
@@ -74,6 +64,13 @@ def main():
     print("precision",avg["precision"])
     print("f1-score",avg["f1-score"])
 
+    # logging.basicConfig(
+    #         format="%(asctime)s %(levelname)-8s %(message)s",
+    #         datefmt="%Y-%m-%dT%H:%M:%SZ",
+    #         level=logging.DEBUG)
+    # for epoch in np.arange(1,epochs+1):
+    #     train(batch_size,epoch)
+    #     test() 
 
     # Polyaxon
     #initiating polyaxon tracking
diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py
index 4bc1eb8..57b3a8b 100644
--- a/experiments/polyaxon_minikube/polyaxon_benchmark.py
+++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py
@@ -1,6 +1,8 @@
 
+from __future__ import print_function
 from asyncio import subprocess
 from base64 import decode
+from cmath import pi
 from concurrent.futures import process
 from itertools import count
 import json
@@ -11,15 +13,22 @@
 from time import sleep
 import random
 from urllib.request import urlopen
+from venv import create
 from kubernetes import client, config,watch
 from kubernetes.client.rest import ApiException
 from string import Template
 import docker
 from ml_benchmark.benchmark_runner import Benchmark
+from ml_benchmark.utils.image_build_wrapper import builder_from_string
 import requests
 import subprocess
 import psutil
 import logging as log
+from polyaxon.cli.projects import create
+from polyaxon.cli.run import run
+from polyaxon.cli.admin import deploy,teardown
+from click.testing import CliRunner
+
 
 
 
@@ -37,7 +46,8 @@ def __init__(self, resources) -> None:
         self.project_description = "Somer random description"
         self.polyaxon_addr="http://localhost:31833/"
         self.post_forward_process=False
-        
+        self.cli_runner=CliRunner()
+
         config.load_kube_config()
 
         self.logging_level= self.resources.get("loggingLevel",log.CRITICAL)
@@ -57,7 +67,7 @@ def __init__(self, resources) -> None:
         else:
             self.docker_user = "witja46"
     
-        self.trial_tag =  f'{self.docker_user}/{self.trial_tag}'
+        
         
         if "dockerUserPassword" in self.resources:
             self.docker_pasword = self.resources["dockerUserPassword"]
@@ -101,10 +111,10 @@ def deploy(self):
         res = os.popen('helm repo add polyaxon https://charts.polyaxon.com').read()
         log.info(res)
 
-        #TODO deploy via helm for better error handling and easier configuration?
         log.info("Deploying polyaxon to minikube:")
-        res = os.popen('polyaxon admin deploy -t minikube').read()
-        log.info(res)
+        #invoking polyaxon cli deploy comand
+        res = self.cli_runner.invoke(deploy)
+        log.info(res.output)
 
       
       
@@ -113,11 +123,10 @@ def deploy(self):
         config.load_kube_config()
         w = watch.Watch()
         c = client.CoreV1Api()
-        # c = client.AppsV1Api()
         deployed = 0
 
 
-        log.info("Waiting for polyaxon pods to be read:")
+        log.info("Waiting for all polyaxon pods to be ready:")
         # From all pods that polyaxon starts we are onlly really intrested for following 4 that are crucial for runnig of the experiments 
         monitored_pods = ["polyaxon-polyaxon-streams","polyaxon-polyaxon-operator","polyaxon-polyaxon-gateway","polyaxon-polyaxon-api"]
         # TODO changing to list_namespaced_deployments?
@@ -183,54 +192,44 @@ def setup(self):
         log.info("Experiment yaml created")
       
       
-        # Creating new docker image if credentials were passed
-        if "dockerUserLogin" in self.resources:
-           
-            #creating task docker image  
-            log.info("Creating task docker image")            
-            #TODO do all this inside of minikube instead of on the client?  
-            self.client = docker.client.from_env()
-            PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../"))
-            image, logs = self.client.images.build(path=PROJECT_ROOT, dockerfile="experiments/polyaxon_minikube/mnist_task/Dockerfile" ,tag=self.trial_tag)
-            log.info(f"Image: {self.trial_tag}")
-            for line in logs  :
-                log.info(line) 
-            
+        log.info("Creating task docker image")   
+        #creating docker image inside of the minikube   
+        self.image_builder = builder_from_string("minikube")()
+        PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../"))
+        self.image_builder.deploy_image(
+        "experiments/polyaxon_minikube/mnist_task/Dockerfile", self.trial_tag,PROJECT_ROOT)
+        print(f"Image: {self.trial_tag}")
 
-            #TODO check if the image is runing properly and there is no  problems with it
 
-            #pushing to repo
-            self.client.login(username=self.docker_user, password=self.docker_pasword)
-            
-            log.info("Pushing image to the Dockerhub")
-            
-            for line in self.client.api.push(self.trial_tag,stream=True,decode=True): 
-                log.info(line) 
+        
             
         
 
     def run(self):
 
-        #TODO add error handling 
+        #TODO add error handling.
+        #TODO change to invocking procject comand instead of sending the http request to the polaxon api? 
         log.info("Creating new project:")
-        project = requests.post(f'{self.polyaxon_addr}/api/v1/default/projects/create', json={"name": self.study_name, "description": self.project_description})
-        log.info(project.text)
+        # project = requests.post(f'{self.polyaxon_addr}/api/v1/default/projects/create', json={"name": self.study_name, })
+
+         
+        options = f'--name {self.study_name} --description '.split()
+        # adding the project description as the last argument  
+        options.append(f'{self.project_description}')
+        #invoking polyaxon project create comand
+        res = self.cli_runner.invoke(create,options)
+        log.info(res.output)
 
 
-        #TODO find out where to pass the --eager flag so that the run can be created with help of api
-        # with open(path.join(path.dirname(__file__), self.experiment_file_name), "r") as f:
-        #     body = f.read()
-        #     log.info(body)
-        #     run = requests.post(f'{self.polyaxon_addr}/api/v1/default/{self.study_name}/runs',json={"content":body,"eager":True ,"tags":["--eager"],"meta_info":{"eager":True,"--eager":True,"flags":"--eager"}})
-        #     log.info(run.text)
       
       
         log.info("Starting polyaxon experiment:")
-        res = os.popen(f'polyaxon run -f ./{self.experiment_file_name} --project {self.study_name} --eager').read()
-        log.info(res)
+        #invoking polyaxon run comand with following options
+        options = f'-f ./{self.experiment_file_name} --project {self.study_name} --eager'.split()
+        res = self.cli_runner.invoke(run,options)
+        log.info(res.output)
 
         
-        #TODO check if there is no problem with the image
         
         #TODO switch to kubernetes api for monitoring runing trials  
         log.info("Waiting for the run to finish:")
@@ -242,12 +241,19 @@ def run(self):
             #checking if all runs were finished
             finished = runs["count"] == self.jobsCount
             sleep(1)
-        return 
+    
+        return
+
+
 
 
     def collect_benchmark_metrics(self):
-        pass
 
+        log.info("Collecting run results:")
+        result = self.get_succeeded_runs()
+        log.info(json.dumps(result,indent=4))               
+               
+        return result["results"]
 
 
     def get_succeeded_runs(self, sort_by="duration"):
@@ -279,14 +285,20 @@ def undeploy(self):
             for proc in process.children(recursive=True):
                 proc.kill()
             process.kill()
-      
 
 
-        #TODO changing to undeploying with helm?
+
         log.info("Undeploying polyaxon:")
-        p = subprocess.Popen(["polyaxon", "admin" ,"teardown"],shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE)
-        out,err = p.communicate(input=b"y") 
-        p.terminate()
+        res = self.cli_runner.invoke(teardown,["--yes"])
+        #by teardown comand the polyaxon cli doesnt set exit_code if there are some problems
+        if("Polyaxon could not teardown the deployment" in res.output):
+            raise Exception(f'Exit code: {res.exit_code}  Error message: \n{res.output}')
+        elif(res.exit_code == 0):
+            print(res.exit_code)
+            log.info(res.output)
+        else:
+            raise Exception(f'Exit code: {res.exit_code}  Error message: \n{res.output}')
+   
              
     
 
@@ -341,23 +353,28 @@ def undeploy(self):
 if __name__ == "__main__":
     #main()
     # bench = PolyaxonBenchmark(resources={
-    #         "dockerUserLogin":"",
-    #         "dockerUserPassword":"",
+    #         # "dockerUserLogin":"",
+    #         # "dockerUserPassword":"",
     #     # "studyName":""
     #     "jobsCount":5,
     #     "workerCount":5,
-    #     "loggingLevel":log.INFO
+    #     "loggingLevel":log.INFO,
+    #     "dockerImageTag":"nowe",
+
+    #     "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip()
     #     })
-    #bench.deploy() 
+    
+    # bench.deploy() 
     # bench.setup()
     # bench.run()
-    # bench.collect_run_results()
-   # bench.undeploy()
+    # # bench.collect_run_results()
+
+    # bench.undeploy()
+    # polyaxon config set --host=http://localhost:8000
 
+    
     resources={
-        # "dockerUserLogin":"",
-        # "dockerUserPassword":"",
-        # "studyName":""
+        # "studyName":"",
         "dockerImageTag":"mnist_test",
         "jobsCount":5,
         "workerCount":5,
@@ -369,4 +386,23 @@ def undeploy(self):
         benchmark_cls=PolyaxonBenchmark, resources=resources)
     runner.run()
 
+    
+    # print(f'polyaxon run -f ./ --project  --eager')
+    # print(f'polyaxon run -f ./grid --project --eager'.split())
+    # runner = CliRunner()
+    # print("start")
+    # res = runner.invoke(run,["-f ./grid.yaml --eager -o json"])
+    # print("fin")
+    # print(res.output,res.exit_code)
+
+    # res = runner.invoke(teardown,["--yes"])    
+    # if("Polyaxon could not teardown the deployment" in res.output):
+    #     print("Ja pierdole")
+    # print(res.output,res.exit_code,res.exception)
+    # print("stop")
+    # res = runner.invoke(deploy)    
+    # print(res.output,res.exit_code)
+
+   # print(res)
+
 
diff --git a/experiments/polyaxon_minikube/requirements.txt b/experiments/polyaxon_minikube/requirements.txt
index 4e272bb..a1c3b00 100644
--- a/experiments/polyaxon_minikube/requirements.txt
+++ b/experiments/polyaxon_minikube/requirements.txt
@@ -1 +1,2 @@
 polyaxon
+kubernetes

From bfeb739f36c10d522ce7274e03b411aae07b4053 Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Mon, 22 Aug 2022 12:27:17 +0200
Subject: [PATCH 7/9] Added tracking of Undeploy, added lighter image for
 faster testing

---
 .../experiment_template.yaml                  |  2 +-
 experiments/polyaxon_minikube/grid.yaml       |  8 ++---
 .../polyaxon_minikube/mnist_task/Dockerfile   |  2 +-
 .../polyaxon_minikube/polyaxon_benchmark.py   | 30 ++++++++++++-------
 .../polyaxon_minikube/requirements.txt        |  1 +
 ml_benchmark/benchmark_runner.py              |  2 +-
 6 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/experiments/polyaxon_minikube/experiment_template.yaml b/experiments/polyaxon_minikube/experiment_template.yaml
index 703895f..6db4ece 100644
--- a/experiments/polyaxon_minikube/experiment_template.yaml
+++ b/experiments/polyaxon_minikube/experiment_template.yaml
@@ -18,7 +18,7 @@ component:
     container:
       imagePullPolicy: IfNotPresent
       image: $worker_image
-      command: [python3, experiments/polyaxon_minikube/mnist_task/task.py]
+      command: [python3, experiments/polyaxon_minikube/$worker_image/task.py]
       args: ["--batch-size=64", "--lr={{ lr }}"]
       env:
         - name: STUDY_NAME
diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml
index 3665986..70606bf 100644
--- a/experiments/polyaxon_minikube/grid.yaml
+++ b/experiments/polyaxon_minikube/grid.yaml
@@ -1,6 +1,6 @@
 version: 1.1
 kind: operation
-name: polyaxon-study-50
+name: polyaxon-study-75
 matrix:
   kind: grid
   numRuns: 5
@@ -17,12 +17,12 @@ component:
     kind: job
     container:
       imagePullPolicy: IfNotPresent
-      image: mnist_test
-      command: [python3, experiments/polyaxon_minikube/mnist_task/task.py]
+      image: task_light
+      command: [python3, experiments/polyaxon_minikube/task_light/task.py]
       args: ["--batch-size=64", "--lr={{ lr }}"]
       env:
         - name: STUDY_NAME
-          value: "polyaxon-study-50"
+          value: "polyaxon-study-75"
         - name: DB_CONN
           value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
         - name: "METRICS_STORAGE_HOST"
diff --git a/experiments/polyaxon_minikube/mnist_task/Dockerfile b/experiments/polyaxon_minikube/mnist_task/Dockerfile
index 1ef3f09..b4701bb 100644
--- a/experiments/polyaxon_minikube/mnist_task/Dockerfile
+++ b/experiments/polyaxon_minikube/mnist_task/Dockerfile
@@ -6,7 +6,7 @@ RUN pip install pip --upgrade
 RUN pip install polyaxon
 
 COPY experiments experiments
-COPY data data
+#COPY data data
 COPY setup.py setup.py
 COPY ml_benchmark ml_benchmark
 
diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py
index 57b3a8b..cab7487 100644
--- a/experiments/polyaxon_minikube/polyaxon_benchmark.py
+++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py
@@ -52,6 +52,7 @@ def __init__(self, resources) -> None:
 
         self.logging_level= self.resources.get("loggingLevel",log.CRITICAL)
 
+        self.create_clean_image = self.resources.get("createCleanImage",True) 
         log.basicConfig(format='%(asctime)s Polyaxon Benchmark %(levelname)s: %(message)s',level=self.logging_level)
 
 
@@ -60,7 +61,7 @@ def __init__(self, resources) -> None:
         if "dockerImageTag" in self.resources:
             self.trial_tag = self.resources["dockerImageTag"]
         else:
-            self.trial_tag = "mnist_test"
+            self.trial_tag = "mnist_task"
 
         if "dockerUserLogin" in self.resources:
             self.docker_user = self.resources["dockerUserLogin"]
@@ -191,14 +192,16 @@ def setup(self):
             f.write(job_yml_objects)
         log.info("Experiment yaml created")
       
-      
-        log.info("Creating task docker image")   
-        #creating docker image inside of the minikube   
-        self.image_builder = builder_from_string("minikube")()
-        PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../"))
-        self.image_builder.deploy_image(
-        "experiments/polyaxon_minikube/mnist_task/Dockerfile", self.trial_tag,PROJECT_ROOT)
-        print(f"Image: {self.trial_tag}")
+        
+        if self.create_clean_image:
+            log.info("Creating task docker image")   
+            #creating docker image inside of the minikube   
+            self.image_builder = builder_from_string("minikube")()
+            PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../"))
+            log.info(PROJECT_ROOT)
+            self.image_builder.deploy_image(
+            f'experiments/polyaxon_minikube/{self.trial_tag}/Dockerfile', self.trial_tag,PROJECT_ROOT)
+            print(f"Image: {self.trial_tag}")
 
 
         
@@ -347,6 +350,9 @@ def undeploy(self):
                     w.stop()
                     break
 
+
+        log.info("Deleting image from minikube")
+        self.image_builder.cleanup(self.trial_tag)
         log.info("Finished undeploying")
 
 
@@ -375,11 +381,13 @@ def undeploy(self):
     
     resources={
         # "studyName":"",
-        "dockerImageTag":"mnist_test",
+        "dockerImageTag":"task_light",
         "jobsCount":5,
+        
         "workerCount":5,
         "loggingLevel":log.INFO,
-        "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip()
+        "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(),
+        "createCleanImage":True
     }
     from ml_benchmark.benchmark_runner import BenchmarkRunner
     runner = BenchmarkRunner(
diff --git a/experiments/polyaxon_minikube/requirements.txt b/experiments/polyaxon_minikube/requirements.txt
index a1c3b00..31e30f0 100644
--- a/experiments/polyaxon_minikube/requirements.txt
+++ b/experiments/polyaxon_minikube/requirements.txt
@@ -1,2 +1,3 @@
 polyaxon
 kubernetes
+torch
\ No newline at end of file
diff --git a/ml_benchmark/benchmark_runner.py b/ml_benchmark/benchmark_runner.py
index d290b0b..8bd0d25 100644
--- a/ml_benchmark/benchmark_runner.py
+++ b/ml_benchmark/benchmark_runner.py
@@ -142,7 +142,7 @@ def run(self):
         run_process = [
             self.benchmark.deploy, self.benchmark.setup, self.benchmark.run,
             self.benchmark.collect_run_results,
-            self.benchmark.test, self.benchmark.collect_benchmark_metrics]
+            self.benchmark.test, self.benchmark.collect_benchmark_metrics,self.benchmark.undeploy]
         benchmark_results = None
 
         try:

From 8c43ae530a6d5c606516747f3773681c6fb98074 Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Mon, 22 Aug 2022 12:53:06 +0200
Subject: [PATCH 8/9] Throwing exception in setup phase if task image was not
 created successfully

---
 experiments/polyaxon_minikube/grid.yaml       |  4 +-
 .../polyaxon_minikube/polyaxon_benchmark.py   | 26 ++++---
 .../polyaxon_minikube/task_light/Dockerfile   | 16 +++++
 .../polyaxon_minikube/task_light/README.md    | 11 +++
 .../task_light/requirements.txt               |  2 +
 .../polyaxon_minikube/task_light/task.py      | 69 +++++++++++++++++++
 ml_benchmark/utils/image_build_wrapper.py     |  2 +-
 7 files changed, 116 insertions(+), 14 deletions(-)
 create mode 100644 experiments/polyaxon_minikube/task_light/Dockerfile
 create mode 100644 experiments/polyaxon_minikube/task_light/README.md
 create mode 100644 experiments/polyaxon_minikube/task_light/requirements.txt
 create mode 100644 experiments/polyaxon_minikube/task_light/task.py

diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml
index 70606bf..0e4071e 100644
--- a/experiments/polyaxon_minikube/grid.yaml
+++ b/experiments/polyaxon_minikube/grid.yaml
@@ -1,6 +1,6 @@
 version: 1.1
 kind: operation
-name: polyaxon-study-75
+name: polyaxon-study-6
 matrix:
   kind: grid
   numRuns: 5
@@ -22,7 +22,7 @@ component:
       args: ["--batch-size=64", "--lr={{ lr }}"]
       env:
         - name: STUDY_NAME
-          value: "polyaxon-study-75"
+          value: "polyaxon-study-6"
         - name: DB_CONN
           value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
         - name: "METRICS_STORAGE_HOST"
diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py
index cab7487..0ddb940 100644
--- a/experiments/polyaxon_minikube/polyaxon_benchmark.py
+++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py
@@ -199,8 +199,11 @@ def setup(self):
             self.image_builder = builder_from_string("minikube")()
             PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../"))
             log.info(PROJECT_ROOT)
-            self.image_builder.deploy_image(
+            res = self.image_builder.deploy_image(
             f'experiments/polyaxon_minikube/{self.trial_tag}/Dockerfile', self.trial_tag,PROJECT_ROOT)
+            print(res)
+            if f'Successfully tagged {self.trial_tag}' not in res:
+                raise Exception("Image was not created:",res)
             print(f"Image: {self.trial_tag}")
 
 
@@ -352,7 +355,7 @@ def undeploy(self):
 
 
         log.info("Deleting image from minikube")
-        self.image_builder.cleanup(self.trial_tag)
+        #self.image_builder.cleanup(self.trial_tag)
         log.info("Finished undeploying")
 
 
@@ -370,12 +373,6 @@ def undeploy(self):
     #     "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip()
     #     })
     
-    # bench.deploy() 
-    # bench.setup()
-    # bench.run()
-    # # bench.collect_run_results()
-
-    # bench.undeploy()
     # polyaxon config set --host=http://localhost:8000
 
     
@@ -390,10 +387,17 @@ def undeploy(self):
         "createCleanImage":True
     }
     from ml_benchmark.benchmark_runner import BenchmarkRunner
-    runner = BenchmarkRunner(
-        benchmark_cls=PolyaxonBenchmark, resources=resources)
-    runner.run()
+    #runner = BenchmarkRunner(
+    #    benchmark_cls=PolyaxonBenchmark, resources=resources)
+    #runner.run()
 
+    bench= PolyaxonBenchmark(resources=resources)
+    #bench.deploy() 
+    bench.setup()
+    # bench.run()
+    # # bench.collect_run_results()
+
+    # bench.undeploy()
     
     # print(f'polyaxon run -f ./ --project  --eager')
     # print(f'polyaxon run -f ./grid --project --eager'.split())
diff --git a/experiments/polyaxon_minikube/task_light/Dockerfile b/experiments/polyaxon_minikube/task_light/Dockerfile
new file mode 100644
index 0000000..f3cf347
--- /dev/null
+++ b/experiments/polyaxon_minikube/task_light/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.9-slim
+
+RUN pip install pip --upgrade
+RUN pip install polyaxon
+
+
+COPY experiments experiments
+# COPY data data
+COPY setup.py setup.py
+COPY ml_benchmark ml_benchmark
+
+
+
+RUN pip install -e .
+
+CMD ["python", "experiments/katib_minikube/task_light/task.py"]
\ No newline at end of file
diff --git a/experiments/polyaxon_minikube/task_light/README.md b/experiments/polyaxon_minikube/task_light/README.md
new file mode 100644
index 0000000..3326bc7
--- /dev/null
+++ b/experiments/polyaxon_minikube/task_light/README.md
@@ -0,0 +1,11 @@
+# PyTorch MNIST Image Classification Example
+
+This is a PyTorch MNIST image classification training container that saves metrics
+to a file or prints them to StdOut. It uses a convolutional neural network to
+train the model.
+
+Katib uses this training container in some Experiments, for instance in the
+[file Metrics Collector example](../../metrics-collector/file-metrics-collector.yaml#L55-L64),
+the [file Metrics Collector with logs in JSON format example](../../metrics-collector/file-metrics-collector-with-json-format.yaml#L52-L62),
+the [median stopping early stopping rule with logs in JSON format example](../../early-stopping/median-stop-with-json-format.yaml#L62-L71)
+and the [PyTorchJob example](../../kubeflow-training-operator/pytorchjob-mnist.yaml#L47-L54).
diff --git a/experiments/polyaxon_minikube/task_light/requirements.txt b/experiments/polyaxon_minikube/task_light/requirements.txt
new file mode 100644
index 0000000..9852601
--- /dev/null
+++ b/experiments/polyaxon_minikube/task_light/requirements.txt
@@ -0,0 +1,2 @@
+Pillow>=9.1.1
+numpy
\ No newline at end of file
diff --git a/experiments/polyaxon_minikube/task_light/task.py b/experiments/polyaxon_minikube/task_light/task.py
new file mode 100644
index 0000000..0f60460
--- /dev/null
+++ b/experiments/polyaxon_minikube/task_light/task.py
@@ -0,0 +1,69 @@
+from __future__ import print_function
+
+from polyaxon import tracking
+import argparse
+import logging
+import os
+import numpy as np
+import time
+from ml_benchmark.latency_tracker import latency_decorator
+
+@latency_decorator
+def train(times ,epochs):
+    loss = 1
+    for epoch in range(epochs):
+        for x in np.arange(1,times + 1): 
+            loss = 1 - (x -1 ) / times
+            time.sleep(0.01)
+            msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
+                    epoch, x , times,
+                    100. * x / times, loss)
+            logging.info(msg)
+
+
+def test():
+
+    test_accuracy =0.8 
+    logging.info("Validation-accuracy={:.4f}\n".format(
+        test_accuracy))
+
+
+def main():
+    
+    #parsing arguments 
+    parser = argparse.ArgumentParser(description="MNIST Example")
+    parser.add_argument("--batch-size", type=int, default=64, metavar="N",
+                        help="input batch size for training (default: 64)")
+    parser.add_argument("--epochs", type=int, default=10, metavar="N",
+                        help="number of epochs to train (default: 10)")
+    parser.add_argument("--lr", type=float, default=0.01, metavar="LR",
+                        help="learning rate (default: 0.01)")
+    args = parser.parse_args()
+  
+    #timestamps format
+    logging.basicConfig(
+            format="%(asctime)s %(levelname)-8s %(message)s",
+            datefmt="%Y-%m-%dT%H:%M:%SZ",
+            level=logging.DEBUG)
+
+   
+    #model training and testing
+    epochs = args.epochs
+    batch_size = args.batch_size
+    lr = args.lr
+    
+    train(batch_size,epochs)
+    test()
+    avg = {"recall":0.5,"f1-score":0.98,"precision":0.7} 
+    # Polyaxon
+    #initiating polyaxon tracking
+    tracking.init()    
+    #logging metrics
+    tracking.log_metrics(recall=avg["recall"], )
+    #logging the results 
+    tracking.log_outputs(f1score=avg["f1-score"],precision=avg["precision"]) 
+
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ml_benchmark/utils/image_build_wrapper.py b/ml_benchmark/utils/image_build_wrapper.py
index e88b366..9c6990c 100644
--- a/ml_benchmark/utils/image_build_wrapper.py
+++ b/ml_benchmark/utils/image_build_wrapper.py
@@ -35,7 +35,7 @@ def deploy_image(self, image, tag, build_context):
         if call.returncode != 0:
             print(call.stderr.decode("utf-8").strip("\n"))
             raise Exception("Failed to deploy image")
-        print("IMAGE IMAGE ", call.stdout, call.stderr)
+#        print("IMAGE IMAGE ", call.stdout, call.stderr)
 
         return call.stdout.decode("utf-8").strip("\n")
 

From 86fb3e882c3c47e3199b53bf21ac45f70c25fc4a Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Tue, 30 Aug 2022 00:00:17 +0200
Subject: [PATCH 9/9] Small clean up. Changed to monitoring running trials with
 the Kubernetes API instead of the Polyaxon API

---
 experiments/polyaxon_minikube/grid.yaml       |   8 +-
 .../polyaxon_minikube/polyaxon_benchmark.py   | 167 +++++++++---------
 2 files changed, 83 insertions(+), 92 deletions(-)

diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml
index 0e4071e..a54550f 100644
--- a/experiments/polyaxon_minikube/grid.yaml
+++ b/experiments/polyaxon_minikube/grid.yaml
@@ -1,14 +1,14 @@
 version: 1.1
 kind: operation
-name: polyaxon-study-6
+name: polyaxon-study-57
 matrix:
   kind: grid
-  numRuns: 5
+  numRuns: 15
   concurrency: 5
   params:
     lr:
       kind: logspace
-      value: 0.01:0.1:5
+      value: 0.01:0.1:15
 component:
   inputs:
     - name: lr
@@ -22,7 +22,7 @@ component:
       args: ["--batch-size=64", "--lr={{ lr }}"]
       env:
         - name: STUDY_NAME
-          value: "polyaxon-study-6"
+          value: "polyaxon-study-57"
         - name: DB_CONN
           value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
         - name: "METRICS_STORAGE_HOST"
diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py
index 0ddb940..8591ad9 100644
--- a/experiments/polyaxon_minikube/polyaxon_benchmark.py
+++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py
@@ -4,10 +4,12 @@
 from base64 import decode
 from cmath import pi
 from concurrent.futures import process
+from importlib.abc import ResourceReader
 from itertools import count
 import json
 from os import path 
 import os
+import re
 from socket import timeout
 import sys
 from time import sleep
@@ -50,56 +52,18 @@ def __init__(self, resources) -> None:
 
         config.load_kube_config()
 
-        self.logging_level= self.resources.get("loggingLevel",log.CRITICAL)
-
+        self.clean_up  = self.resources.get("cleanUp",True)
         self.create_clean_image = self.resources.get("createCleanImage",True) 
-        log.basicConfig(format='%(asctime)s Polyaxon Benchmark %(levelname)s: %(message)s',level=self.logging_level)
-
-
         self.metrics_ip = resources.get("metricsIP")
-    
-        if "dockerImageTag" in self.resources:
-            self.trial_tag = self.resources["dockerImageTag"]
-        else:
-            self.trial_tag = "mnist_task"
-
-        if "dockerUserLogin" in self.resources:
-            self.docker_user = self.resources["dockerUserLogin"]
-        else:
-            self.docker_user = "witja46"
-    
-        
+        self.trial_tag = resources.get("dockerImageTag", "mnist_task")
+        self.study_name = resources.get("studyName",f'polyaxon-study-{random.randint(0, 100)}')
+        self.workerCpu=resources.get("workerCpu",2)
+        self.workerMemory=resources.get("workerMemory",2)
+        self.workerCount=resources.get("workerCount",5)
+        self.jobsCount=resources.get("jobsCount",6) 
         
-        if "dockerUserPassword" in self.resources:
-            self.docker_pasword = self.resources["dockerUserPassword"]
-        else:
-            self.docker_pasword = ""        
-
-
-        if "studyName" in self.resources:
-            self.study_name = self.resources["studyName"]
-        else:
-            self.study_name = f"polyaxon-study-{random.randint(0, 100)}"
-
-        if "workerCpu" in self.resources:
-            self.workerCpu = self.resources["workerCpu"]
-        else:
-            self.workerCpu = 2
-
-        if "workerMemory" in resources:
-            self.workerMemory = self.resources["workerMemory"]
-        else:
-            self.workerMemory = 2
-
-        if "workerCount" in resources:
-            self.workerCount = self.resources["workerCount"]
-        else:
-            self.workerCount = 5
-
-        if "jobsCount" in resources:
-            self.jobsCount = self.resources["jobsCount"]
-        else:
-            self.jobsCount = 6
+        self.logging_level= self.resources.get("loggingLevel",log.CRITICAL)
+        log.basicConfig(format='%(asctime)s Polyaxon Benchmark %(levelname)s: %(message)s',level=self.logging_level)
         
     def deploy(self):
         """
@@ -121,32 +85,30 @@ def deploy(self):
       
         
 
-        config.load_kube_config()
-        w = watch.Watch()
-        c = client.CoreV1Api()
-        deployed = 0
 
 
         log.info("Waiting for all polyaxon pods to be ready:")
+        config.load_kube_config()
+        w = watch.Watch()
+        c = client.CoreV1Api()
+        
         # From all pods that polyaxon starts we are onlly really intrested for following 4 that are crucial for runnig of the experiments 
-        monitored_pods = ["polyaxon-polyaxon-streams","polyaxon-polyaxon-operator","polyaxon-polyaxon-gateway","polyaxon-polyaxon-api"]
-        # TODO changing to list_namespaced_deployments?
+        unready_pods = ["polyaxon-polyaxon-streams","polyaxon-polyaxon-operator","polyaxon-polyaxon-gateway","polyaxon-polyaxon-api"]
         for e in w.stream(c.list_namespaced_pod, namespace="polyaxon"):
             ob = e["object"]          
             
-            for name in monitored_pods:
+            for name in unready_pods:
 
                 #checking if it is one of the pods that we want to monitor 
                 if name in ob.metadata.name:
                     
-                    # Checking if the pod already is runnig and its underlying containers are ready
+                    # Check whether the pod is already running and its first container is ready; if so, we no longer need to monitor it
                     if ob.status.phase == "Running" and ob.status.container_statuses[0].ready: 
                         log.info(f'{ob.metadata.name} is ready')
-                        monitored_pods.remove(name)
-                        deployed = deployed + 1
+                        unready_pods.remove(name)
 
-                        #if all monitored pods are running the deployment process was ended
-                        if(deployed == 4 ):
+                        # once all monitored pods are ready, the deployment process is finished
+                        if not unready_pods:
                             w.stop()
                             log.info("Finished deploying crucial pods")
                             
@@ -195,16 +157,18 @@ def setup(self):
         
         if self.create_clean_image:
             log.info("Creating task docker image")   
-            #creating docker image inside of the minikube   
             self.image_builder = builder_from_string("minikube")()
             PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../"))
-            log.info(PROJECT_ROOT)
+
+            #creating docker image inside of the minikube   
             res = self.image_builder.deploy_image(
             f'experiments/polyaxon_minikube/{self.trial_tag}/Dockerfile', self.trial_tag,PROJECT_ROOT)
-            print(res)
+            
+            log.info(res)
+            # if something went wrong while creating the image, the benchmark should be stopped
             if f'Successfully tagged {self.trial_tag}' not in res:
                 raise Exception("Image was not created:",res)
-            print(f"Image: {self.trial_tag}")
+            log.info(f"Image: {self.trial_tag}")
 
 
         
@@ -213,16 +177,16 @@ def setup(self):
 
     def run(self):
 
-        #TODO add error handling.
-        #TODO change to invocking procject comand instead of sending the http request to the polaxon api? 
-        log.info("Creating new project:")
-        # project = requests.post(f'{self.polyaxon_addr}/api/v1/default/projects/create', json={"name": self.study_name, })
+        # project = requests.post(f'{self.polyaxon_addr}/api/v1/default/projects/create', json={"name": self.study_name, }) # Alternative way of creating the project crd with http request to the polyaxon api
 
          
+        log.info("Creating new project:")
         options = f'--name {self.study_name} --description '.split()
         # adding the project description as the last argument  
         options.append(f'{self.project_description}')
+    
         #invoking polyaxon project create comand
+        #TODO add error handling.
         res = self.cli_runner.invoke(create,options)
         log.info(res.output)
 
@@ -231,7 +195,8 @@ def run(self):
       
         log.info("Starting polyaxon experiment:")
         #invoking polyaxon run comand with following options
-        options = f'-f ./{self.experiment_file_name} --project {self.study_name} --eager'.split()
+        options = f'-f {self.experiment_file_name} --project {self.study_name} --eager'.split()
+        #TODO add error handling.
         res = self.cli_runner.invoke(run,options)
         log.info(res.output)
 
@@ -240,13 +205,29 @@ def run(self):
         #TODO switch to kubernetes api for monitoring runing trials  
         log.info("Waiting for the run to finish:")
         finished = False
-        while not finished:
-            runs = self.get_succeeded_runs()
-            log.info(f'{runs["count"]} jobs out of {self.jobsCount} succeded')
+        
+        w = watch.Watch()
+        c = client.BatchV1Api()
+        done = 0
+        for e in w.stream(c.list_namespaced_job, namespace=self.namespace):
+            if "object" in e and e["object"].status.completion_time is not None:
+                done = done + 1 
+                log.info(f'{done} jobs out of {self.jobsCount} succeded')
+                if(done == self.jobsCount):
+                    log.info("Finished all runs")
+                    w.stop()
+
+
+
+                
+                
+        # while not finished:
+        #     runs = self.get_succeeded_runs()
+        #     log.info(f'{runs["count"]} jobs out of {self.jobsCount} succeded')
             
-            #checking if all runs were finished
-            finished = runs["count"] == self.jobsCount
-            sleep(1)
+        #     #checking if all runs were finished
+        #     finished = runs["count"] == self.jobsCount
+        #     sleep(1)
     
         return
 
@@ -315,6 +296,7 @@ def undeploy(self):
         w = watch.Watch()
         c = client.CoreV1Api()
         deployed = 0
+        to_undeploy= ["polyaxon-polyaxon-streams","polyaxon-polyaxon-operator","polyaxon-polyaxon-gateway","polyaxon-polyaxon-api"]
         log.info("Waiting for polyaxon pods to be terminated:")
         for e in w.stream(c.list_namespaced_pod, namespace=self.namespace):
             ob = e["object"]
@@ -322,14 +304,17 @@ def undeploy(self):
             log.debug(f'{deployed} pods out of 4 were killed')
             log.debug("\n new in stream:\n")
             log.debug(ob.metadata.name,ob.status.phase)
+            for name in to_undeploy:
+                if name in ob.metadata.name:
 
-            if not ob.status.container_statuses[0].ready:
-                log.info(f'Containers of {ob.metadata.name} are terminated')
-                deployed = deployed + 1
-                if(deployed == 4 ):
-                    w.stop()
-                    # log.info("Finished ")
-                    break
+                    if not ob.status.container_statuses[0].ready:
+                        log.info(f'Containers of {ob.metadata.name} are terminated')
+                        to_undeploy.remove(name)
+                        
+                        if not to_undeploy:
+                            w.stop()
+                            # log.info("Finished ")
+                            break
         
       
         log.info("Killed all pods deleteing the namespace:")
@@ -350,6 +335,10 @@ def undeploy(self):
                 except ApiException as err:
                     log.info(err)
                     log.info("Namespace sucessfully deleted")
+                    if self.clean_up:
+                        log.info("Deleteing task docker image from minikube")
+                        sleep(2)
+                        self.image_builder.cleanup(self.trial_tag)
                     w.stop()
                     break
 
@@ -379,21 +368,22 @@ def undeploy(self):
     resources={
         # "studyName":"",
         "dockerImageTag":"task_light",
-        "jobsCount":5,
-        
+        "jobsCount":15,
+       "cleanUp":False,
+"createCleanImage":False,
         "workerCount":5,
         "loggingLevel":log.INFO,
         "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(),
         "createCleanImage":True
     }
     from ml_benchmark.benchmark_runner import BenchmarkRunner
-    #runner = BenchmarkRunner(
-    #    benchmark_cls=PolyaxonBenchmark, resources=resources)
-    #runner.run()
+    runner = BenchmarkRunner(
+        benchmark_cls=PolyaxonBenchmark, resources=resources)
+    runner.run()
 
-    bench= PolyaxonBenchmark(resources=resources)
+    #bench= PolyaxonBenchmark(resources=resources)
     #bench.deploy() 
-    bench.setup()
+    #bench.setup()
     # bench.run()
     # # bench.collect_run_results()
 
@@ -418,3 +408,4 @@ def undeploy(self):
    # print(res)
 
 
+