From 1b3d5b87c9c908e3e8f7f04f4c0c61d872fefd62 Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Tue, 16 Aug 2022 17:29:54 +0200
Subject: [PATCH 1/9] First Katib script version

---
 .../katib_benchmark/experiment_template.yaml  |  44 ++++
 experiments/katib_benchmark/grid.yaml         |  44 ++++
 .../katib_benchmark/katib_benchmark.py        | 245 ++++++++++++++++++
 .../katib_benchmark/mnist_task/Dockerfile     |  15 ++
 .../katib_benchmark/mnist_task/README.md      |  11 +
 .../mnist_task/requirements.txt               |   2 +
 .../katib_benchmark/mnist_task/task.py        |  58 +++++
 7 files changed, 419 insertions(+)
 create mode 100644 experiments/katib_benchmark/experiment_template.yaml
 create mode 100644 experiments/katib_benchmark/grid.yaml
 create mode 100644 experiments/katib_benchmark/katib_benchmark.py
 create mode 100644 experiments/katib_benchmark/mnist_task/Dockerfile
 create mode 100644 experiments/katib_benchmark/mnist_task/README.md
 create mode 100644 experiments/katib_benchmark/mnist_task/requirements.txt
 create mode 100644 experiments/katib_benchmark/mnist_task/task.py

diff --git a/experiments/katib_benchmark/experiment_template.yaml b/experiments/katib_benchmark/experiment_template.yaml
new file mode 100644
index 0000000..3b576d1
--- /dev/null
+++ b/experiments/katib_benchmark/experiment_template.yaml
@@ -0,0 +1,44 @@
+---
+apiVersion: kubeflow.org/v1beta1
+kind: Experiment
+metadata:
+  namespace: kubeflow
+  name: $study_name
+spec:
+  objective:
+    type: maximize
+    goal: 0.99
+    objectiveMetricName: Validation-accuracy
+  algorithm:
+    algorithmName: grid
+  parallelTrialCount: $worker_num
+  maxTrialCount: $jobs_num
+  maxFailedTrialCount: $worker_num
+  parameters:
+    - name: lr
+      parameterType: double
+      feasibleSpace:
+        min: "0.001"
+        max: "0.02"
+        step: "0.0001"
+  trialTemplate:
+    primaryContainerName: training-container
+    trialParameters:
+      - name: learningRate
+        description: Learning rate for the training model
+        reference: lr
+    trialSpec:
+      apiVersion: batch/v1
+      kind: Job
+      spec:
+        template:
+          spec:
+            containers:
+              - name: training-container
+                image: $worker_image
+                command:
+                  - "python3"
+                  - "/opt/mnist_task/task.py"
+                  - "--batch-size=64"
+                  - "--lr=$trialParameters"
+            restartPolicy: Never
diff --git a/experiments/katib_benchmark/grid.yaml b/experiments/katib_benchmark/grid.yaml
new file mode 100644
index 0000000..e39d6cf
--- /dev/null
+++ b/experiments/katib_benchmark/grid.yaml
@@ -0,0 +1,44 @@
+---
+apiVersion: kubeflow.org/v1beta1
+kind: Experiment
+metadata:
+  namespace: kubeflow
+  name: katib-study-79
+spec:
+  objective:
+    type: maximize
+    goal: 0.99
+    objectiveMetricName: Validation-accuracy
+  algorithm:
+    algorithmName: grid
+  parallelTrialCount: 20
+  maxTrialCount: 20
+  maxFailedTrialCount: 20
+  parameters:
+    - name: lr
+      parameterType: double
+      feasibleSpace:
+        min: "0.001"
+        max: "0.02"
+        step: "0.0001"
+  trialTemplate:
+    primaryContainerName: training-container
+    trialParameters:
+      - name: learningRate
+        description: Learning rate for the training model
+        reference: lr
+    trialSpec:
+      apiVersion: batch/v1
+      kind: Job
+      spec:
+        template:
+          spec:
+            containers:
+              - name: training-container
+                image: witja46/mnist_katib:latest
+                command:
+                  - "python3"
+                  - "/opt/mnist_task/task.py"
+                  - "--batch-size=64"
+                  - "--lr=${trialParameters.learningRate}"
+            restartPolicy: Never
diff --git a/experiments/katib_benchmark/katib_benchmark.py b/experiments/katib_benchmark/katib_benchmark.py
new file mode 100644
index 0000000..b1381d9
--- /dev/null
+++ b/experiments/katib_benchmark/katib_benchmark.py
@@ -0,0 +1,245 @@
+from os import path
+import os
+import sys
+from time import sleep
+from kubernetes.client.rest import ApiException
+import random
+from kubernetes import client, config
+from string import Template
+import yaml
+import docker
+PROJECT_ROOT = path.abspath(path.join(__file__ ,"../../.."))
+sys.path.append(PROJECT_ROOT)
+from ml_benchmark.benchmark_runner import Benchmark
+
+
+
+
+class KatibBenchmark(Benchmark):
+
+    def __init__(self, objective, grid, resources) -> None:
+        self.objective = objective
+        self.grid = grid
+        self.resources = resources
+        self.group="kubeflow.org"
+        self.version="v1beta1"
+        self.namespace='kubeflow'
+        self.plural="experiments"
+        self.experiment_file_name = "grid.yaml"
+     
+        config.load_kube_config()
+
+        
+
+    
+        if "dockerImageTag" in self.resources:
+            self.trial_tag = self.resources["dockerImageTag"]
+        else:
+            self.trial_tag = "mnist_katib:latest"
+
+        if "dockerUserLogin" in self.resources:
+            self.docker_user = self.resources["dockerUserLogin"]
+            self.trial_tag =  f'{self.docker_user}/{self.trial_tag}'
+        else:
+            self.docker_user = ""
+            self.trial_tag = "witja46/mnist_katib:latest"
+        
+        if "dockerUserPassword" in self.resources:
+            self.docker_pasword = self.resources["dockerUserPassword"]
+        else:
+            self.docker_pasword = ""        
+
+
+        if "studyName" in self.resources:
+            self.study_name = self.resources["studyName"]
+        else:
+            self.study_name = f"katib-study-{random.randint(0, 100)}"
+
+        if "workerCpu" in self.resources:
+            self.workerCpu = self.resources["workerCpu"]
+        else:
+            self.workerCpu = 2
+
+        if "workerMemory" in resources:
+            self.workerMemory = self.resources["workerMemory"]
+        else:
+            self.workerMemory = 2
+
+        if "workerCount" in resources:
+            self.workerCount = self.resources["workerCount"]
+        else:
+            self.workerCount = 5
+
+        if "jobs_Count" in resources:
+            self.jobsCount = self.resources["jobs_Count"]
+        else:
+            self.jobsCount = 6
+    def deploy(self):
+        """
+            With the completion of this step the desired architecture of the HPO Framework should be running
+            on a platform, e.g., in the case of Kubernetes it refers to the steps necessary to deploy all pods
+            and services in Kubernetes.
+        """
+        print("Deploying katib:")
+        res = os.popen('kubectl apply -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master').read()
+        print(res)
+
+
+        #TODO waiit untill all pods are running
+
+
+
+      
+        
+    def setup(self):
+        """
+        Every Operation that is needed before the actual optimization (trial) starts and that is not relevant
+        for starting up workers or the necessary architecture.
+        """
+       
+        #creating experiment yaml          
+             
+        experiment_definition = {
+            "worker_num": self.workerCount,
+            "jobs_num":self.jobsCount,
+            "worker_cpu": self.workerCpu,
+            "worker_mem": f"{self.workerMemory}Gi",
+            "worker_image": self.trial_tag,
+            "study_name": self.study_name,
+            "trialParameters":"${trialParameters.learningRate}"
+        }
+
+        #loading and fulling the template
+        with open(path.join(path.dirname(__file__), "experiment_template.yaml"), "r") as f:
+            job_template = Template(f.read())
+            job_yml_objects = job_template.substitute(experiment_definition)
+            
+        #writing the experiment definition into the file        
+        with open(path.join(path.dirname(__file__), self.experiment_file_name), "w") as f:
+            f.write(job_yml_objects)
+        print("Experiment yaml created")
+      
+      
+        # Creating new docker image if credentials were passed
+        if "dockerUserLogin" in self.resources:
+           
+            #creating task docker image  
+            print("Creating task docker image")  
+            self.client = docker.client.from_env()
+            image, logs = self.client.images.build(path="./mnist_task",tag=self.trial_tag)
+            print(f"Image: {self.trial_tag}")
+            for line in logs  :
+                print(line) 
+            
+            #pushing to repo
+            self.client.login(username=self.docker_user, password=self.docker_pasword)
+            for line in self.client.images.push(self.trial_tag, stream=True, decode=True):
+                print(line) 
+        
+        
+
+    def run(self):
+        
+        print("Starting Katib experiment:")
+        api_instance=  client.CustomObjectsApi()
+
+        #Loading experiment definition 
+        with open(path.join(path.dirname(__file__), self.experiment_file_name), "r") as f:
+            body = yaml.safe_load(f)
+
+            #Starting experiment by creating experiment crd with help of kubernetes API
+            try:
+                api_response = api_instance.create_namespaced_custom_object(
+                    group=self.group,
+                    version=self.version,
+                    namespace=self.namespace,
+                    body=body,
+                    plural=self.plural,
+                )
+                print("Succses: Experiment started")
+            #    print(api_response)  
+            except ApiException as e:
+                print("Exception when calling CustomObjectsApi->create_cluster_custom_object: %s\n" % e)
+            
+ 
+    def collect_benchmark_metrics(self):
+        return super().collect_benchmark_metrics()
+
+
+
+    def get_experiment(self):
+        config.load_kube_config()
+        api = client.CustomObjectsApi()
+        #requesting the experiment CRD that contains data about the finished experiment
+        try:
+            resource = api.get_namespaced_custom_object_status(
+                    group=self.group,
+                    version=self.version,
+                    namespace=self.namespace,
+                    name=self.study_name,
+                    plural=self.plural,
+                )
+       
+            # print(resource)
+            # print(resource["status"])
+        except ApiException as e:
+            print("Exception when calling CustomObjectsApi->get_namespaced_custom_object_status: %s\n" % e)
+        return resource
+
+
+    def collect_run_results(self):
+        
+        print("Collecting run results:")
+        
+        sleep(5)
+        experiment = self.get_experiment()
+        while "status" not in experiment:
+            print("Waitinng for the status")
+            sleep(5)
+            experiment = self.get_experiment()
+            print(experiment)
+        
+    
+        while experiment["status"]["conditions"][-1]["type"]!="Succeeded":
+            experiment = self.get_experiment()
+            print("\nWaiting for the experiment to finish:")
+            if "trialsSucceeded" in experiment["status"]:
+                print(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded')
+            print(experiment["status"]["conditions"][-1])
+            sleep(2)
+           
+       
+        
+        print("\n Experiment finished with following optimal trial:")
+        print(experiment["status"]["currentOptimalTrial"])
+         
+        
+    
+    def test(self):
+        return super().test()
+
+    def undeploy(self):
+        return super().undeploy()
+
+    def main():
+        print("jeb")
+      
+
+
+
+if __name__ == "__main__":
+    #main()
+    bench = KatibBenchmark(1,1,resources={
+        # "dockerUserLogin":"",
+        # "dockerUserPassword":"",
+        # "studyName":""
+        "jobs_Count":20,
+        "workerCount":20
+        })
+    # bench.deploy()
+    bench.setup()
+    bench.run()
+    bench.collect_run_results()
+
+
+
diff --git a/experiments/katib_benchmark/mnist_task/Dockerfile b/experiments/katib_benchmark/mnist_task/Dockerfile
new file mode 100644
index 0000000..1cbfd5b
--- /dev/null
+++ b/experiments/katib_benchmark/mnist_task/Dockerfile
@@ -0,0 +1,15 @@
+FROM python:3.9-slim
+
+ADD . /opt/mnist_task
+WORKDIR /opt/mnist_task
+
+# Add folder for the logs.
+RUN mkdir /katib
+RUN pip install --no-cache-dir -r requirements.txt
+
+RUN chgrp -R 0 /opt/mnist_task \
+  && chmod -R g+rwX /opt/mnist_task \
+  && chgrp -R 0 /katib \
+  && chmod -R g+rwX /katib
+
+ENTRYPOINT ["python3", "/opt/mnist_task/task.py"]
diff --git a/experiments/katib_benchmark/mnist_task/README.md b/experiments/katib_benchmark/mnist_task/README.md
new file mode 100644
index 0000000..3326bc7
--- /dev/null
+++ b/experiments/katib_benchmark/mnist_task/README.md
@@ -0,0 +1,11 @@
+# PyTorch MNIST Image Classification Example
+
+This is a PyTorch MNIST image classification training container that saves metrics
+to a file or prints them to StdOut. It uses a convolutional neural network to
+train the model.
+
+Katib uses this training container in some Experiments, for instance in the
+[file Metrics Collector example](../../metrics-collector/file-metrics-collector.yaml#L55-L64),
+the [file Metrics Collector with logs in JSON format example](../../metrics-collector/file-metrics-collector-with-json-format.yaml#L52-L62),
+the [median stopping early stopping rule with logs in JSON format example](../../early-stopping/median-stop-with-json-format.yaml#L62-L71)
+and the [PyTorchJob example](../../kubeflow-training-operator/pytorchjob-mnist.yaml#L47-L54).
diff --git a/experiments/katib_benchmark/mnist_task/requirements.txt b/experiments/katib_benchmark/mnist_task/requirements.txt
new file mode 100644
index 0000000..9852601
--- /dev/null
+++ b/experiments/katib_benchmark/mnist_task/requirements.txt
@@ -0,0 +1,2 @@
+Pillow>=9.1.1
+numpy
\ No newline at end of file
diff --git a/experiments/katib_benchmark/mnist_task/task.py b/experiments/katib_benchmark/mnist_task/task.py
new file mode 100644
index 0000000..a8dcc0c
--- /dev/null
+++ b/experiments/katib_benchmark/mnist_task/task.py
@@ -0,0 +1,58 @@
+from __future__ import print_function
+
+import argparse
+import logging
+import os
+import numpy as np
+import time
+
+def train(times ,epoch):
+    loss = 1
+    for x in np.arange(1,times + 1): 
+        loss = 1 - (x -1 ) / times
+        time.sleep(0.01)
+        msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
+                epoch, x , times,
+                100. * x / times, loss)
+        logging.info(msg)
+
+
+def test():
+
+    test_accuracy =0.8 
+    logging.info("Validation-accuracy={:.4f}\n".format(
+        test_accuracy))
+
+
+
+def main():
+    
+    #parsing arguments 
+    parser = argparse.ArgumentParser(description="MNIST Example")
+    parser.add_argument("--batch-size", type=int, default=64, metavar="N",
+                        help="input batch size for training (default: 64)")
+    parser.add_argument("--epochs", type=int, default=10, metavar="N",
+                        help="number of epochs to train (default: 10)")
+    parser.add_argument("--lr", type=float, default=0.01, metavar="LR",
+                        help="learning rate (default: 0.01)")
+    args = parser.parse_args()
+  
+    #timestamps format
+    logging.basicConfig(
+            format="%(asctime)s %(levelname)-8s %(message)s",
+            datefmt="%Y-%m-%dT%H:%M:%SZ",
+            level=logging.DEBUG)
+
+   
+    #model training and testing
+    epochs = args.epochs
+    batch_size = args.batch_size
+    lr = args.lr
+    for epoch in np.arange(1,epochs+1):
+        train(batch_size,epoch)
+        test() 
+
+
+
+if __name__ == "__main__":
+    main()

From 4adc75006247cf73e8d284ff5e222bd3fada4488 Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Tue, 16 Aug 2022 23:33:24 +0200
Subject: [PATCH 2/9] 1.Added blocking in deploy phase 2.Docker image is now
 created in minikube.

---
 .../experiment_template.yaml                  | 16 +++-
 .../grid.yaml                                 | 26 ++++--
 .../katib_benchmark.py                        | 92 ++++++++++++-------
 .../katib_minikube/mnist_task/Dockerfile      | 16 ++++
 .../mnist_task/README.md                      |  0
 .../mnist_task/requirements.txt               |  0
 experiments/katib_minikube/mnist_task/task.py | 78 ++++++++++++++++
 .../mnist_task_old}/Dockerfile                |  0
 .../katib_minikube/mnist_task_old/README.md   | 11 +++
 .../mnist_task_old/requirements.txt           |  2 +
 .../mnist_task_old}/task.py                   |  0
 experiments/katib_minikube/requirements.txt   |  2 +
 ml_benchmark/__init__.py                      |  2 +-
 13 files changed, 198 insertions(+), 47 deletions(-)
 rename experiments/{katib_benchmark => katib_minikube}/experiment_template.yaml (65%)
 rename experiments/{katib_benchmark => katib_minikube}/grid.yaml (56%)
 rename experiments/{katib_benchmark => katib_minikube}/katib_benchmark.py (70%)
 create mode 100644 experiments/katib_minikube/mnist_task/Dockerfile
 rename experiments/{katib_benchmark => katib_minikube}/mnist_task/README.md (100%)
 rename experiments/{katib_benchmark => katib_minikube}/mnist_task/requirements.txt (100%)
 create mode 100644 experiments/katib_minikube/mnist_task/task.py
 rename experiments/{katib_benchmark/mnist_task => katib_minikube/mnist_task_old}/Dockerfile (100%)
 create mode 100644 experiments/katib_minikube/mnist_task_old/README.md
 create mode 100644 experiments/katib_minikube/mnist_task_old/requirements.txt
 rename experiments/{katib_benchmark/mnist_task => katib_minikube/mnist_task_old}/task.py (100%)
 create mode 100644 experiments/katib_minikube/requirements.txt

diff --git a/experiments/katib_benchmark/experiment_template.yaml b/experiments/katib_minikube/experiment_template.yaml
similarity index 65%
rename from experiments/katib_benchmark/experiment_template.yaml
rename to experiments/katib_minikube/experiment_template.yaml
index 3b576d1..e733d32 100644
--- a/experiments/katib_benchmark/experiment_template.yaml
+++ b/experiments/katib_minikube/experiment_template.yaml
@@ -7,8 +7,8 @@ metadata:
 spec:
   objective:
     type: maximize
-    goal: 0.99
-    objectiveMetricName: Validation-accuracy
+    goal: 0.8
+    objectiveMetricName: precision
   algorithm:
     algorithmName: grid
   parallelTrialCount: $worker_num
@@ -20,7 +20,7 @@ spec:
       feasibleSpace:
         min: "0.001"
         max: "0.02"
-        step: "0.0001"
+        step: "0.001"
   trialTemplate:
     primaryContainerName: training-container
     trialParameters:
@@ -36,9 +36,17 @@ spec:
             containers:
               - name: training-container
                 image: $worker_image
+                imagePullPolicy: IfNotPresent
                 command:
                   - "python3"
-                  - "/opt/mnist_task/task.py"
+                  - "experiments/katib_minikube/mnist_task/task.py"
                   - "--batch-size=64"
                   - "--lr=$trialParameters"
+                env:
+                  - name: STUDY_NAME
+                    value: "$study_name"
+                  - name: DB_CONN
+                    value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
+                  - name: "METRICS_STORAGE_HOST"
+                    value: "$metrics_ip"
             restartPolicy: Never
diff --git a/experiments/katib_benchmark/grid.yaml b/experiments/katib_minikube/grid.yaml
similarity index 56%
rename from experiments/katib_benchmark/grid.yaml
rename to experiments/katib_minikube/grid.yaml
index e39d6cf..c37e5a0 100644
--- a/experiments/katib_benchmark/grid.yaml
+++ b/experiments/katib_minikube/grid.yaml
@@ -3,24 +3,24 @@ apiVersion: kubeflow.org/v1beta1
 kind: Experiment
 metadata:
   namespace: kubeflow
-  name: katib-study-79
+  name: katib-study-74
 spec:
   objective:
     type: maximize
-    goal: 0.99
-    objectiveMetricName: Validation-accuracy
+    goal: 0.8
+    objectiveMetricName: precision
   algorithm:
     algorithmName: grid
-  parallelTrialCount: 20
-  maxTrialCount: 20
-  maxFailedTrialCount: 20
+  parallelTrialCount: 4
+  maxTrialCount: 4
+  maxFailedTrialCount: 4
   parameters:
     - name: lr
       parameterType: double
       feasibleSpace:
         min: "0.001"
         max: "0.02"
-        step: "0.0001"
+        step: "0.001"
   trialTemplate:
     primaryContainerName: training-container
     trialParameters:
@@ -35,10 +35,18 @@ spec:
           spec:
             containers:
               - name: training-container
-                image: witja46/mnist_katib:latest
+                image: mnist_katib
+                imagePullPolicy: IfNotPresent
                 command:
                   - "python3"
-                  - "/opt/mnist_task/task.py"
+                  - "experiments/katib_minikube/mnist_task/task.py"
                   - "--batch-size=64"
                   - "--lr=${trialParameters.learningRate}"
+                env:
+                  - name: STUDY_NAME
+                    value: "katib-study-74"
+                  - name: DB_CONN
+                    value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
+                  - name: "METRICS_STORAGE_HOST"
+                    value: "134.101.4.161"
             restartPolicy: Never
diff --git a/experiments/katib_benchmark/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py
similarity index 70%
rename from experiments/katib_benchmark/katib_benchmark.py
rename to experiments/katib_minikube/katib_benchmark.py
index b1381d9..56131f7 100644
--- a/experiments/katib_benchmark/katib_benchmark.py
+++ b/experiments/katib_minikube/katib_benchmark.py
@@ -2,15 +2,18 @@
 import os
 import sys
 from time import sleep
+from urllib.request import urlopen
 from kubernetes.client.rest import ApiException
 import random
-from kubernetes import client, config
+from kubernetes import client, config, watch
+from kubernetes.client.rest import ApiException
 from string import Template
 import yaml
 import docker
-PROJECT_ROOT = path.abspath(path.join(__file__ ,"../../.."))
-sys.path.append(PROJECT_ROOT)
+import logging as log
+
 from ml_benchmark.benchmark_runner import Benchmark
+from ml_benchmark.utils.image_build_wrapper import builder_from_string
 
 
 
@@ -26,28 +29,23 @@ def __init__(self, objective, grid, resources) -> None:
         self.namespace='kubeflow'
         self.plural="experiments"
         self.experiment_file_name = "grid.yaml"
-     
+        self.metrics_ip = resources.get("metricsIP")     
         config.load_kube_config()
 
+        self.logging_level= self.resources.get("loggingLevel",log.INFO)
+
+        log.basicConfig(format='%(asctime)s Katib Benchmark %(levelname)s: %(message)s',level=self.logging_level)
+
         
 
     
         if "dockerImageTag" in self.resources:
             self.trial_tag = self.resources["dockerImageTag"]
         else:
-            self.trial_tag = "mnist_katib:latest"
+            self.trial_tag = "mnist_katib"
+
 
-        if "dockerUserLogin" in self.resources:
-            self.docker_user = self.resources["dockerUserLogin"]
-            self.trial_tag =  f'{self.docker_user}/{self.trial_tag}'
-        else:
-            self.docker_user = ""
-            self.trial_tag = "witja46/mnist_katib:latest"
         
-        if "dockerUserPassword" in self.resources:
-            self.docker_pasword = self.resources["dockerUserPassword"]
-        else:
-            self.docker_pasword = ""        
 
 
         if "studyName" in self.resources:
@@ -85,6 +83,38 @@ def deploy(self):
         print(res)
 
 
+
+        config.load_kube_config()
+        w = watch.Watch()
+        c = client.CoreV1Api()
+        deployed = 0
+        log.info("Waiting for all Katib pods to be ready:")
+        # Of all the pods that Katib starts, only those listed below are monitored (crucial for running experiments). NOTE(review): five names are listed, but the loop below stops after 4 are ready.
+        monitored_pods = ["katib-cert-generator","katib-db-manager","katib-mysql","katib-ui","katib-controller"]
+        # TODO changing to list_namespaced_deployments?
+        for e in w.stream(c.list_namespaced_pod, namespace=self.namespace):
+            ob = e["object"]          
+
+            for name in monitored_pods:
+
+                #checking if it is one of the pods that we want to monitor 
+                if name in ob.metadata.name:
+
+                    # Checking if the pod is already running and its first container reports ready
+                    if ob.status.phase == "Running" and ob.status.container_statuses[0].ready: 
+                        log.info(f'{ob.metadata.name} is ready')
+                        monitored_pods.remove(name)
+                        deployed = deployed + 1
+
+                        #once 4 monitored pods are ready, the deployment is considered finished
+                        if(deployed == 4 ):
+                            w.stop()
+                            log.info("Finished deploying crucial pods")
+
+
+
+
+
         #TODO waiit untill all pods are running
 
 
@@ -106,7 +136,8 @@ def setup(self):
             "worker_mem": f"{self.workerMemory}Gi",
             "worker_image": self.trial_tag,
             "study_name": self.study_name,
-            "trialParameters":"${trialParameters.learningRate}"
+            "trialParameters":"${trialParameters.learningRate}",
+            "metrics_ip":self.metrics_ip
         }
 
         #loading and fulling the template
@@ -120,21 +151,15 @@ def setup(self):
         print("Experiment yaml created")
       
       
-        # Creating new docker image if credentials were passed
-        if "dockerUserLogin" in self.resources:
-           
-            #creating task docker image  
-            print("Creating task docker image")  
-            self.client = docker.client.from_env()
-            image, logs = self.client.images.build(path="./mnist_task",tag=self.trial_tag)
-            print(f"Image: {self.trial_tag}")
-            for line in logs  :
-                print(line) 
-            
-            #pushing to repo
-            self.client.login(username=self.docker_user, password=self.docker_pasword)
-            for line in self.client.images.push(self.trial_tag, stream=True, decode=True):
-                print(line) 
+       # Creating the task docker image (built unconditionally; no credentials are checked here)
+        log.info("Creating task docker image")   
+        #creating docker image inside of the minikube   
+        self.image_builder = builder_from_string("minikube")()
+        PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../"))
+        res = self.image_builder.deploy_image(
+        "experiments/katib_minikube/mnist_task/Dockerfile", self.trial_tag,PROJECT_ROOT)
+        print(res)
+        print(f"Image: {self.trial_tag}")  
         
         
 
@@ -233,8 +258,9 @@ def main():
         # "dockerUserLogin":"",
         # "dockerUserPassword":"",
         # "studyName":""
-        "jobs_Count":20,
-        "workerCount":20
+        "jobs_Count":4,
+        "workerCount":4,
+        "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip()
         })
     # bench.deploy()
     bench.setup()
diff --git a/experiments/katib_minikube/mnist_task/Dockerfile b/experiments/katib_minikube/mnist_task/Dockerfile
new file mode 100644
index 0000000..7e62b3b
--- /dev/null
+++ b/experiments/katib_minikube/mnist_task/Dockerfile
@@ -0,0 +1,16 @@
+#TODO change to some slimmer base image
+FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime
+#FROM python:3.9-slim
+
+RUN pip install pip --upgrade
+
+COPY experiments experiments
+# COPY data data
+COPY setup.py setup.py
+COPY ml_benchmark ml_benchmark
+
+
+
+RUN pip install -e .
+
+CMD ["python", "experiments/katib_minikube/mnist_task/task.py"]
diff --git a/experiments/katib_benchmark/mnist_task/README.md b/experiments/katib_minikube/mnist_task/README.md
similarity index 100%
rename from experiments/katib_benchmark/mnist_task/README.md
rename to experiments/katib_minikube/mnist_task/README.md
diff --git a/experiments/katib_benchmark/mnist_task/requirements.txt b/experiments/katib_minikube/mnist_task/requirements.txt
similarity index 100%
rename from experiments/katib_benchmark/mnist_task/requirements.txt
rename to experiments/katib_minikube/mnist_task/requirements.txt
diff --git a/experiments/katib_minikube/mnist_task/task.py b/experiments/katib_minikube/mnist_task/task.py
new file mode 100644
index 0000000..2d3af82
--- /dev/null
+++ b/experiments/katib_minikube/mnist_task/task.py
@@ -0,0 +1,78 @@
+import argparse
+import logging
+import numpy as np
+import time
+import os
+import sys
+PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../../../"))
+sys.path.append(PROJECT_ROOT)
+from ml_benchmark.workload.mnist.mnist_task import MnistTask
+
+
+def train(times ,epoch):
+    loss = 1
+    for x in np.arange(1,times + 1): 
+        loss = 1 - (x -1 ) / times
+        time.sleep(0.1)
+        msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
+                epoch, x , times,
+                100. * x / times, loss)
+        logging.info(msg)
+
+
+def test():
+
+    test_accuracy =0.8 
+    logging.info("Validation-accuracy={:.4f}\n".format(
+        test_accuracy))
+
+
+
+def main():
+   
+    
+    #parsing arguments 
+    parser = argparse.ArgumentParser(description="MNIST Example")
+    parser.add_argument("--batch-size", type=int, default=64, metavar="N",
+                        help="input batch size for training (default: 64)")
+    parser.add_argument("--epochs", type=int, default=10, metavar="N",
+                        help="number of epochs to train (default: 10)")
+    parser.add_argument("--lr", type=float, default=0.01, metavar="LR",
+                        help="learning rate (default: 0.01)")
+    args = parser.parse_args()
+    epochs = args.epochs
+    batch_size = args.batch_size
+    lr = args.lr
+
+
+
+
+
+
+    #MnistTask
+    task = MnistTask(config_init={"epochs": epochs})
+    objective = task.create_objective()
+    #TODO add the weight decay to the definition of the template
+    objective.set_hyperparameters({"learning_rate": lr, "weight_decay": 0.01})
+    objective.train()
+    validation_scores = objective.validate()
+    
+  
+    #Getting results
+    avg = validation_scores["weighted avg"]
+    print(f'precision={avg["precision"]}')
+    # print(f'f1-score={avg["f1-score"]}')
+  
+    # logging.basicConfig(
+    #         format="%(asctime)s %(levelname)-8s %(message)s",
+    #         datefmt="%Y-%m-%dT%H:%M:%SZ",
+    #         level=logging.DEBUG)
+  
+
+    # logging.info("precision={:.4f}\n".format(
+    #     avg["precision"]))
+
+    # logging.info("f1-score={:.4f}\n".format(
+    #     avg["f1-score"]))
+if __name__ == "__main__":
+    main()
diff --git a/experiments/katib_benchmark/mnist_task/Dockerfile b/experiments/katib_minikube/mnist_task_old/Dockerfile
similarity index 100%
rename from experiments/katib_benchmark/mnist_task/Dockerfile
rename to experiments/katib_minikube/mnist_task_old/Dockerfile
diff --git a/experiments/katib_minikube/mnist_task_old/README.md b/experiments/katib_minikube/mnist_task_old/README.md
new file mode 100644
index 0000000..3326bc7
--- /dev/null
+++ b/experiments/katib_minikube/mnist_task_old/README.md
@@ -0,0 +1,11 @@
+# PyTorch MNIST Image Classification Example
+
+This is a PyTorch MNIST image classification training container that saves metrics
+to a file or prints them to StdOut. It uses a convolutional neural network to
+train the model.
+
+Katib uses this training container in some Experiments, for instance in the
+[file Metrics Collector example](../../metrics-collector/file-metrics-collector.yaml#L55-L64),
+the [file Metrics Collector with logs in JSON format example](../../metrics-collector/file-metrics-collector-with-json-format.yaml#L52-L62),
+the [median stopping early stopping rule with logs in JSON format example](../../early-stopping/median-stop-with-json-format.yaml#L62-L71)
+and the [PyTorchJob example](../../kubeflow-training-operator/pytorchjob-mnist.yaml#L47-L54).
diff --git a/experiments/katib_minikube/mnist_task_old/requirements.txt b/experiments/katib_minikube/mnist_task_old/requirements.txt
new file mode 100644
index 0000000..9852601
--- /dev/null
+++ b/experiments/katib_minikube/mnist_task_old/requirements.txt
@@ -0,0 +1,2 @@
+Pillow>=9.1.1
+numpy
\ No newline at end of file
diff --git a/experiments/katib_benchmark/mnist_task/task.py b/experiments/katib_minikube/mnist_task_old/task.py
similarity index 100%
rename from experiments/katib_benchmark/mnist_task/task.py
rename to experiments/katib_minikube/mnist_task_old/task.py
diff --git a/experiments/katib_minikube/requirements.txt b/experiments/katib_minikube/requirements.txt
new file mode 100644
index 0000000..8ef8413
--- /dev/null
+++ b/experiments/katib_minikube/requirements.txt
@@ -0,0 +1,2 @@
+kubernetes==24.2.0
+torch
\ No newline at end of file
diff --git a/ml_benchmark/__init__.py b/ml_benchmark/__init__.py
index f628f98..b28a677 100644
--- a/ml_benchmark/__init__.py
+++ b/ml_benchmark/__init__.py
@@ -1,7 +1,7 @@
 __version__ = "develop"
 install_requires = [
         "scikit-learn==0.24.2",
-        "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==4.4.2",
+        "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==4.2.2",
         "psycopg2-binary"],
 test_install_requires = ["pytest==7.1.2", "pytest-cov==3.0.0"]
 URL = "https://github.com/gebauerm/ml_benchmark"

From 0ebbeec34657cbbd4aace83f3593241b01b75a43 Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Wed, 17 Aug 2022 12:40:26 +0200
Subject: [PATCH 3/9] Added undeploy with blocking.

---
 experiments/katib_minikube/grid.yaml          |   4 +-
 experiments/katib_minikube/katib_benchmark.py | 105 ++++++++++++++----
 ml_benchmark/__init__.py                      |   2 +-
 3 files changed, 89 insertions(+), 22 deletions(-)

diff --git a/experiments/katib_minikube/grid.yaml b/experiments/katib_minikube/grid.yaml
index c37e5a0..a472b10 100644
--- a/experiments/katib_minikube/grid.yaml
+++ b/experiments/katib_minikube/grid.yaml
@@ -3,7 +3,7 @@ apiVersion: kubeflow.org/v1beta1
 kind: Experiment
 metadata:
   namespace: kubeflow
-  name: katib-study-74
+  name: katib-study-5
 spec:
   objective:
     type: maximize
@@ -44,7 +44,7 @@ spec:
                   - "--lr=${trialParameters.learningRate}"
                 env:
                   - name: STUDY_NAME
-                    value: "katib-study-74"
+                    value: "katib-study-5"
                   - name: DB_CONN
                     value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
                   - name: "METRICS_STORAGE_HOST"
diff --git a/experiments/katib_minikube/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py
index 56131f7..34bbbc6 100644
--- a/experiments/katib_minikube/katib_benchmark.py
+++ b/experiments/katib_minikube/katib_benchmark.py
@@ -29,7 +29,8 @@ def __init__(self, objective, grid, resources) -> None:
         self.namespace='kubeflow'
         self.plural="experiments"
         self.experiment_file_name = "grid.yaml"
-        self.metrics_ip = resources.get("metricsIP")     
+        self.metrics_ip = resources.get("metricsIP")
+        self.generate_new_docker_image = resources.get("generateNewDockerImage",True)     
         config.load_kube_config()
 
         self.logging_level= self.resources.get("loggingLevel",log.INFO)
@@ -79,7 +80,7 @@ def deploy(self):
             and services in kubernetes.
         """
         print("Deploying katib:")
-        res = os.popen('kubectl apply -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master').read()
+        res = os.popen('kubectl apply -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"').read()
         print(res)
 
 
@@ -151,15 +152,16 @@ def setup(self):
         print("Experiment yaml created")
       
       
-       # Creating new docker image if credentials were passed
-        log.info("Creating task docker image")   
-        #creating docker image inside of the minikube   
-        self.image_builder = builder_from_string("minikube")()
-        PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../"))
-        res = self.image_builder.deploy_image(
-        "experiments/katib_minikube/mnist_task/Dockerfile", self.trial_tag,PROJECT_ROOT)
-        print(res)
-        print(f"Image: {self.trial_tag}")  
+       #only generating the docker image if specified so.
+        if self.generate_new_docker_image:
+            log.info("Creating task docker image")   
+            #creating docker image inside of the minikube   
+            self.image_builder = builder_from_string("minikube")()
+            PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../"))
+            res = self.image_builder.deploy_image(
+            "experiments/katib_minikube/mnist_task/Dockerfile", self.trial_tag,PROJECT_ROOT)
+            print(res)
+            print(f"Image: {self.trial_tag}")  
         
         
 
@@ -182,7 +184,7 @@ def run(self):
                     plural=self.plural,
                 )
                 print("Succses: Experiment started")
-            #    print(api_response)  
+                print(api_response)  
             except ApiException as e:
                 print("Exception when calling CustomObjectsApi->create_cluster_custom_object: %s\n" % e)
             
@@ -215,14 +217,42 @@ def get_experiment(self):
     def collect_run_results(self):
         
         print("Collecting run results:")
+        config.load_kube_config()
+        w = watch.Watch()
+        c = client.CustomObjectsApi()
+        deployed = 0
+        
+        #TODO is there a way to run watch on namespaced custome object?
+        # for e in w.stream(c.get_namespaced_custom_object_with_http_info,
+        #                     group=self.group,
+        #                     version=self.version,
+        #                     namespace=self.namespace,
+        #                     name=self.study_name,
+        #                     plural=self.plural):
+        #     experiment = e["object"]
+        #     if "status" not in experiment:
+        #         print("Waitinng for the status")
+        #         sleep(5)
+        #     elif experiment["status"]["conditions"][-1]["type"]!="Succeeded":
+        #          print("chuja")
+        #          if "trialsSucceeded" in experiment["status"]:
+        #             print(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded')
+        #             print(experiment["status"]["conditions"][-1])
+        #             sleep(2)
+        #     else:
+        #         print("\n Experiment finished with following optimal trial:")
+        #         print(experiment["status"]["currentOptimalTrial"])  
+                
+                
+           
+
         
-        sleep(5)
         experiment = self.get_experiment()
         while "status" not in experiment:
             print("Waitinng for the status")
             sleep(5)
             experiment = self.get_experiment()
-            print(experiment)
+            # print(experiment)
         
     
         while experiment["status"]["conditions"][-1]["type"]!="Succeeded":
@@ -244,7 +274,42 @@ def test(self):
         return super().test()
 
     def undeploy(self):
-        return super().undeploy()
+        print("Undeploying katib:")
+        # res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"')
+       
+        
+        
+        w = watch.Watch()
+        c = client.CoreV1Api()
+        log.info("Deleteing the namespace:")
+        res = c.delete_namespace_with_http_info(name=self.namespace)    
+
+
+        log.info("Checking status of the  namespace:")
+        try:
+            log.debug(c.read_namespace_status_with_http_info(name=self.namespace))
+        except ApiException as err:
+            log.info(err)
+            log.info("Namespace sucessfully deleted")
+            log.info("Finished undeploying")
+            return
+        
+      
+        #if the namespace was still existent we must wait till it is really terminated
+        for e in w.stream(c.list_namespace):
+            ob = e["object"]
+            # if the status of our namespace was changed we check if it the namespace was really removed from the cluster by requesting and expecting it to be not found
+            #TODO do this in other way            
+            if ob.metadata.name == self.namespace:
+                try:
+                    log.debug(c.read_namespace_status_with_http_info(name=self.namespace))
+                except ApiException as err:
+                    log.info(err)
+                    log.info("Namespace sucessfully deleted")
+                    w.stop()
+                    break
+
+        log.info("Finished undeploying")
 
     def main():
         print("jeb")
@@ -258,14 +323,16 @@ def main():
         # "dockerUserLogin":"",
         # "dockerUserPassword":"",
         # "studyName":""
-        "jobs_Count":4,
-        "workerCount":4,
-        "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip()
+        "jobs_Count":10,
+        "workerCount":10,
+        "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(),
+        "generateNewDockerImage": True
         })
-    # bench.deploy()
+    bench.deploy()
     bench.setup()
     bench.run()
     bench.collect_run_results()
+    bench.undeploy()
 
 
 
diff --git a/ml_benchmark/__init__.py b/ml_benchmark/__init__.py
index b28a677..2742851 100644
--- a/ml_benchmark/__init__.py
+++ b/ml_benchmark/__init__.py
@@ -1,7 +1,7 @@
 __version__ = "develop"
 install_requires = [
         "scikit-learn==0.24.2",
-        "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==4.2.2",
+        "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==4..2",
         "psycopg2-binary"],
 test_install_requires = ["pytest==7.1.2", "pytest-cov==3.0.0"]
 URL = "https://github.com/gebauerm/ml_benchmark"

From 41c48b054acdf560e5eb3b7fef46ddc6f5621110 Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Wed, 17 Aug 2022 12:58:58 +0200
Subject: [PATCH 4/9] Switched to logging, undeploying with kubectl delete

---
 experiments/katib_minikube/grid.yaml          | 10 ++--
 experiments/katib_minikube/katib_benchmark.py | 58 +++++++++----------
 2 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/experiments/katib_minikube/grid.yaml b/experiments/katib_minikube/grid.yaml
index a472b10..df91f9a 100644
--- a/experiments/katib_minikube/grid.yaml
+++ b/experiments/katib_minikube/grid.yaml
@@ -3,7 +3,7 @@ apiVersion: kubeflow.org/v1beta1
 kind: Experiment
 metadata:
   namespace: kubeflow
-  name: katib-study-5
+  name: katib-study-22
 spec:
   objective:
     type: maximize
@@ -11,9 +11,9 @@ spec:
     objectiveMetricName: precision
   algorithm:
     algorithmName: grid
-  parallelTrialCount: 4
-  maxTrialCount: 4
-  maxFailedTrialCount: 4
+  parallelTrialCount: 10
+  maxTrialCount: 10
+  maxFailedTrialCount: 10
   parameters:
     - name: lr
       parameterType: double
@@ -44,7 +44,7 @@ spec:
                   - "--lr=${trialParameters.learningRate}"
                 env:
                   - name: STUDY_NAME
-                    value: "katib-study-5"
+                    value: "katib-study-22"
                   - name: DB_CONN
                     value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
                   - name: "METRICS_STORAGE_HOST"
diff --git a/experiments/katib_minikube/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py
index 34bbbc6..1009a24 100644
--- a/experiments/katib_minikube/katib_benchmark.py
+++ b/experiments/katib_minikube/katib_benchmark.py
@@ -79,9 +79,9 @@ def deploy(self):
             on a platform, e.g,. in the case of Kubernetes it referes to the steps nassary to deploy all pods
             and services in kubernetes.
         """
-        print("Deploying katib:")
+        log.info("Deploying katib:")
         res = os.popen('kubectl apply -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"').read()
-        print(res)
+        log.info(res)
 
 
 
@@ -149,7 +149,7 @@ def setup(self):
         #writing the experiment definition into the file        
         with open(path.join(path.dirname(__file__), self.experiment_file_name), "w") as f:
             f.write(job_yml_objects)
-        print("Experiment yaml created")
+        log.info("Experiment yaml created")
       
       
        #only generating the docker image if specified so.
@@ -160,14 +160,14 @@ def setup(self):
             PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../"))
             res = self.image_builder.deploy_image(
             "experiments/katib_minikube/mnist_task/Dockerfile", self.trial_tag,PROJECT_ROOT)
-            print(res)
-            print(f"Image: {self.trial_tag}")  
+            log.info(res)
+            log.info(f"Image: {self.trial_tag}")  
         
         
 
     def run(self):
         
-        print("Starting Katib experiment:")
+        log.info("Starting Katib experiment:")
         api_instance=  client.CustomObjectsApi()
 
         #Loading experiment definition 
@@ -183,10 +183,10 @@ def run(self):
                     body=body,
                     plural=self.plural,
                 )
-                print("Succses: Experiment started")
-                print(api_response)  
+                log.info("Succses: Experiment started")
+                log.info(api_response)  
             except ApiException as e:
-                print("Exception when calling CustomObjectsApi->create_cluster_custom_object: %s\n" % e)
+                log.info("Exception when calling CustomObjectsApi->create_cluster_custom_object: %s\n" % e)
             
  
     def collect_benchmark_metrics(self):
@@ -207,16 +207,16 @@ def get_experiment(self):
                     plural=self.plural,
                 )
        
-            # print(resource)
-            # print(resource["status"])
+            # log.info(resource)
+            # log.info(resource["status"])
         except ApiException as e:
-            print("Exception when calling CustomObjectsApi->get_namespaced_custom_object_status: %s\n" % e)
+            log.info("Exception when calling CustomObjectsApi->get_namespaced_custom_object_status: %s\n" % e)
         return resource
 
 
     def collect_run_results(self):
         
-        print("Collecting run results:")
+        log.info("Collecting run results:")
         config.load_kube_config()
         w = watch.Watch()
         c = client.CustomObjectsApi()
@@ -231,17 +231,17 @@ def collect_run_results(self):
         #                     plural=self.plural):
         #     experiment = e["object"]
         #     if "status" not in experiment:
-        #         print("Waitinng for the status")
+        #         log.info("Waitinng for the status")
         #         sleep(5)
         #     elif experiment["status"]["conditions"][-1]["type"]!="Succeeded":
-        #          print("chuja")
+        #          log.info("chuja")
         #          if "trialsSucceeded" in experiment["status"]:
-        #             print(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded')
-        #             print(experiment["status"]["conditions"][-1])
+        #             log.info(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded')
+        #             log.info(experiment["status"]["conditions"][-1])
         #             sleep(2)
         #     else:
-        #         print("\n Experiment finished with following optimal trial:")
-        #         print(experiment["status"]["currentOptimalTrial"])  
+        #         log.info("\n Experiment finished with following optimal trial:")
+        #         log.info(experiment["status"]["currentOptimalTrial"])  
                 
                 
            
@@ -249,24 +249,24 @@ def collect_run_results(self):
         
         experiment = self.get_experiment()
         while "status" not in experiment:
-            print("Waitinng for the status")
+            log.info("Waitinng for the status")
             sleep(5)
             experiment = self.get_experiment()
-            # print(experiment)
+            # log.info(experiment)
         
     
         while experiment["status"]["conditions"][-1]["type"]!="Succeeded":
             experiment = self.get_experiment()
-            print("\nWaiting for the experiment to finish:")
+            log.info("\nWaiting for the experiment to finish:")
             if "trialsSucceeded" in experiment["status"]:
-                print(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded')
-            print(experiment["status"]["conditions"][-1])
+                log.info(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded')
+            log.info(experiment["status"]["conditions"][-1])
             sleep(2)
            
        
         
-        print("\n Experiment finished with following optimal trial:")
-        print(experiment["status"]["currentOptimalTrial"])
+        log.info("\n Experiment finished with following optimal trial:")
+        log.info(experiment["status"]["currentOptimalTrial"])
          
         
     
@@ -274,8 +274,8 @@ def test(self):
         return super().test()
 
     def undeploy(self):
-        print("Undeploying katib:")
-        # res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"')
+        log.info("Undeploying katib:")
+        res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"')
        
         
         
@@ -311,8 +311,6 @@ def undeploy(self):
 
         log.info("Finished undeploying")
 
-    def main():
-        print("jeb")
       
 
 

From 3940d11896d54d20f0b127dca1a32d4124c59f6e Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Wed, 17 Aug 2022 13:20:29 +0200
Subject: [PATCH 5/9] Fixes in undeploy

---
 experiments/katib_minikube/katib_benchmark.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/experiments/katib_minikube/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py
index 1009a24..f96ecea 100644
--- a/experiments/katib_minikube/katib_benchmark.py
+++ b/experiments/katib_minikube/katib_benchmark.py
@@ -275,14 +275,14 @@ def test(self):
 
     def undeploy(self):
         log.info("Undeploying katib:")
-        res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"')
-       
+        res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"').read()
+        log.info(res)
         
         
         w = watch.Watch()
         c = client.CoreV1Api()
-        log.info("Deleteing the namespace:")
-        res = c.delete_namespace_with_http_info(name=self.namespace)    
+        # log.info("Deleteing the namespace:")
+        # res = c.delete_namespace_with_http_info(name=self.namespace)    
 
 
         log.info("Checking status of the  namespace:")
@@ -326,10 +326,10 @@ def undeploy(self):
         "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(),
         "generateNewDockerImage": True
         })
-    bench.deploy()
-    bench.setup()
-    bench.run()
-    bench.collect_run_results()
+    # bench.deploy()
+    # bench.setup()
+    # bench.run()
+    # bench.collect_run_results()
     bench.undeploy()
 
 

From 0c7f6e31284fc527b08f4fa86ea1900ef1e106a4 Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Tue, 23 Aug 2022 12:47:17 +0200
Subject: [PATCH 6/9] 1. Undeploy works now. 2. Small changes in light_task

---
 .../katib_minikube/experiment_template.yaml   |  2 +-
 experiments/katib_minikube/grid.yaml          |  8 +--
 experiments/katib_minikube/katib_benchmark.py | 53 +++++++++++++++----
 .../katib_minikube/light_task/Dockerfile      | 14 +++++
 .../{mnist_task_old => light_task}/README.md  |  0
 .../requirements.txt                          |  0
 .../{mnist_task_old => light_task}/task.py    | 30 ++++++-----
 .../katib_minikube/mnist_task_old/Dockerfile  | 15 ------
 8 files changed, 78 insertions(+), 44 deletions(-)
 create mode 100644 experiments/katib_minikube/light_task/Dockerfile
 rename experiments/katib_minikube/{mnist_task_old => light_task}/README.md (100%)
 rename experiments/katib_minikube/{mnist_task_old => light_task}/requirements.txt (100%)
 rename experiments/katib_minikube/{mnist_task_old => light_task}/task.py (65%)
 delete mode 100644 experiments/katib_minikube/mnist_task_old/Dockerfile

diff --git a/experiments/katib_minikube/experiment_template.yaml b/experiments/katib_minikube/experiment_template.yaml
index e733d32..1e719a9 100644
--- a/experiments/katib_minikube/experiment_template.yaml
+++ b/experiments/katib_minikube/experiment_template.yaml
@@ -39,7 +39,7 @@ spec:
                 imagePullPolicy: IfNotPresent
                 command:
                   - "python3"
-                  - "experiments/katib_minikube/mnist_task/task.py"
+                  - "experiments/katib_minikube/$worker_image/task.py"
                   - "--batch-size=64"
                   - "--lr=$trialParameters"
                 env:
diff --git a/experiments/katib_minikube/grid.yaml b/experiments/katib_minikube/grid.yaml
index df91f9a..0dc96de 100644
--- a/experiments/katib_minikube/grid.yaml
+++ b/experiments/katib_minikube/grid.yaml
@@ -3,7 +3,7 @@ apiVersion: kubeflow.org/v1beta1
 kind: Experiment
 metadata:
   namespace: kubeflow
-  name: katib-study-22
+  name: katib-study-80
 spec:
   objective:
     type: maximize
@@ -35,16 +35,16 @@ spec:
           spec:
             containers:
               - name: training-container
-                image: mnist_katib
+                image: light_task
                 imagePullPolicy: IfNotPresent
                 command:
                   - "python3"
-                  - "experiments/katib_minikube/mnist_task/task.py"
+                  - "experiments/katib_minikube/light_task/task.py"
                   - "--batch-size=64"
                   - "--lr=${trialParameters.learningRate}"
                 env:
                   - name: STUDY_NAME
-                    value: "katib-study-22"
+                    value: "katib-study-80"
                   - name: DB_CONN
                     value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
                   - name: "METRICS_STORAGE_HOST"
diff --git a/experiments/katib_minikube/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py
index f96ecea..1443ef4 100644
--- a/experiments/katib_minikube/katib_benchmark.py
+++ b/experiments/katib_minikube/katib_benchmark.py
@@ -159,7 +159,7 @@ def setup(self):
             self.image_builder = builder_from_string("minikube")()
             PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../"))
             res = self.image_builder.deploy_image(
-            "experiments/katib_minikube/mnist_task/Dockerfile", self.trial_tag,PROJECT_ROOT)
+            f'experiments/katib_minikube/{self.trial_tag}/Dockerfile', self.trial_tag,PROJECT_ROOT)
             log.info(res)
             log.info(f"Image: {self.trial_tag}")  
         
@@ -223,7 +223,7 @@ def collect_run_results(self):
         deployed = 0
         
         #TODO is there a way to run watch on namespaced custome object?
-        # for e in w.stream(c.get_namespaced_custom_object_with_http_info,
+        # for e in w.stream(c.get_namespaced_custom_object,
         #                     group=self.group,
         #                     version=self.version,
         #                     namespace=self.namespace,
@@ -274,15 +274,36 @@ def test(self):
         return super().test()
 
     def undeploy(self):
-        log.info("Undeploying katib:")
-        res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"').read()
-        log.info(res)
+        log.info("Deleteing the experiment crd from the cluster")
+        # res = os.popen('kubectl delete -f grid.yaml').read()
+        # log.info(res)
+        config.load_kube_config()
+        api = client.CustomObjectsApi()
+        try:
+            resource = api.delete_collection_namespaced_custom_object(
+                    group=self.group,
+                    version=self.version,
+                    namespace=self.namespace,
+                    plural=self.plural,
+                )
+            log.info(resource)
+            # log.info(resource)
+            # log.info(resource["status"])
+        except ApiException as e:
+            log.info("Exception when calling CustomObjectsApi->get_namespaced_custom_object_status: %s\n" % e)
+
+
         
         
+        # log.info("Undeploying katib:")
+        # res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"').read()
+        # log.info(res)
+
+
         w = watch.Watch()
         c = client.CoreV1Api()
-        # log.info("Deleteing the namespace:")
-        # res = c.delete_namespace_with_http_info(name=self.namespace)    
+        log.info("Deleteing the namespace:")
+        res = c.delete_namespace_with_http_info(name=self.namespace)    
 
 
         log.info("Checking status of the  namespace:")
@@ -291,10 +312,15 @@ def undeploy(self):
         except ApiException as err:
             log.info(err)
             log.info("Namespace sucessfully deleted")
+            
+            log.info("Deleteing task docker image from minikube")
+            sleep(1)
+            self.image_builder.cleanup(self.trial_tag)
             log.info("Finished undeploying")
             return
         
       
+        log.info("Namespace still not termintated")
         #if the namespace was still existent we must wait till it is really terminated
         for e in w.stream(c.list_namespace):
             ob = e["object"]
@@ -306,6 +332,10 @@ def undeploy(self):
                 except ApiException as err:
                     log.info(err)
                     log.info("Namespace sucessfully deleted")
+                    
+                    log.info("Deleteing task docker image from minikube")
+                    sleep(1)
+                    self.image_builder.cleanup(self.trial_tag)
                     w.stop()
                     break
 
@@ -322,14 +352,15 @@ def undeploy(self):
         # "dockerUserPassword":"",
         # "studyName":""
         "jobs_Count":10,
+        "dockerImageTag":"light_task",
         "workerCount":10,
         "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(),
         "generateNewDockerImage": True
         })
-    # bench.deploy()
-    # bench.setup()
-    # bench.run()
-    # bench.collect_run_results()
+    bench.deploy()
+    bench.setup()
+    bench.run()
+    bench.collect_run_results()
     bench.undeploy()
 
 
diff --git a/experiments/katib_minikube/light_task/Dockerfile b/experiments/katib_minikube/light_task/Dockerfile
new file mode 100644
index 0000000..b2bd6aa
--- /dev/null
+++ b/experiments/katib_minikube/light_task/Dockerfile
@@ -0,0 +1,14 @@
+FROM python:3.9-slim
+
+RUN pip install pip --upgrade
+
+COPY experiments experiments
+# COPY data data
+COPY setup.py setup.py
+COPY ml_benchmark ml_benchmark
+
+# Install the ml_benchmark package in editable mode from the copied setup.py.
+
+RUN pip install -e .
+# Default entry point: the task script lives in light_task/ (not task_light/).
+CMD ["python", "experiments/katib_minikube/light_task/task.py"]
\ No newline at end of file
diff --git a/experiments/katib_minikube/mnist_task_old/README.md b/experiments/katib_minikube/light_task/README.md
similarity index 100%
rename from experiments/katib_minikube/mnist_task_old/README.md
rename to experiments/katib_minikube/light_task/README.md
diff --git a/experiments/katib_minikube/mnist_task_old/requirements.txt b/experiments/katib_minikube/light_task/requirements.txt
similarity index 100%
rename from experiments/katib_minikube/mnist_task_old/requirements.txt
rename to experiments/katib_minikube/light_task/requirements.txt
diff --git a/experiments/katib_minikube/mnist_task_old/task.py b/experiments/katib_minikube/light_task/task.py
similarity index 65%
rename from experiments/katib_minikube/mnist_task_old/task.py
rename to experiments/katib_minikube/light_task/task.py
index a8dcc0c..1101b71 100644
--- a/experiments/katib_minikube/mnist_task_old/task.py
+++ b/experiments/katib_minikube/light_task/task.py
@@ -1,26 +1,30 @@
 from __future__ import print_function
 
 import argparse
+import imp
 import logging
 import os
 import numpy as np
 import time
+from ml_benchmark.latency_tracker import latency_decorator
 
-def train(times ,epoch):
+@latency_decorator
+def train(times ,epochs):
     loss = 1
-    for x in np.arange(1,times + 1): 
-        loss = 1 - (x -1 ) / times
-        time.sleep(0.01)
-        msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
-                epoch, x , times,
-                100. * x / times, loss)
-        logging.info(msg)
+    for epoch in range(epochs):
+        for x in np.arange(1,times + 1): 
+            loss = 1 - (x -1 ) / times
+            time.sleep(0.01)
+            msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
+                    epoch, x , times,
+                    100. * x / times, loss)
+            logging.info(msg)
 
 
 def test():
 
-    test_accuracy =0.8 
-    logging.info("Validation-accuracy={:.4f}\n".format(
+    test_accuracy =0.7 
+    logging.info("precision={:.4f}\n".format(
         test_accuracy))
 
 
@@ -48,9 +52,9 @@ def main():
     epochs = args.epochs
     batch_size = args.batch_size
     lr = args.lr
-    for epoch in np.arange(1,epochs+1):
-        train(batch_size,epoch)
-        test() 
+    
+    train(batch_size,epochs)
+    test() 
 
 
 
diff --git a/experiments/katib_minikube/mnist_task_old/Dockerfile b/experiments/katib_minikube/mnist_task_old/Dockerfile
deleted file mode 100644
index 1cbfd5b..0000000
--- a/experiments/katib_minikube/mnist_task_old/Dockerfile
+++ /dev/null
@@ -1,15 +0,0 @@
-FROM python:3.9-slim
-
-ADD . /opt/mnist_task
-WORKDIR /opt/mnist_task
-
-# Add folder for the logs.
-RUN mkdir /katib
-RUN pip install --no-cache-dir -r requirements.txt
-
-RUN chgrp -R 0 /opt/mnist_task \
-  && chmod -R g+rwX /opt/mnist_task \
-  && chgrp -R 0 /katib \
-  && chmod -R g+rwX /katib
-
-ENTRYPOINT ["python3", "/opt/mnist_task/task.py"]

From 2e38ed21b1be93ba1402d914bac21e1d7ef6093f Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Wed, 24 Aug 2022 13:33:37 +0200
Subject: [PATCH 7/9] fixes in deploy

---
 experiments/katib_minikube/grid.yaml          |   4 +-
 experiments/katib_minikube/katib_benchmark.py | 115 ++++++++----------
 2 files changed, 54 insertions(+), 65 deletions(-)

diff --git a/experiments/katib_minikube/grid.yaml b/experiments/katib_minikube/grid.yaml
index 0dc96de..d873f53 100644
--- a/experiments/katib_minikube/grid.yaml
+++ b/experiments/katib_minikube/grid.yaml
@@ -3,7 +3,7 @@ apiVersion: kubeflow.org/v1beta1
 kind: Experiment
 metadata:
   namespace: kubeflow
-  name: katib-study-80
+  name: katib-study-48
 spec:
   objective:
     type: maximize
@@ -44,7 +44,7 @@ spec:
                   - "--lr=${trialParameters.learningRate}"
                 env:
                   - name: STUDY_NAME
-                    value: "katib-study-80"
+                    value: "katib-study-48"
                   - name: DB_CONN
                     value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
                   - name: "METRICS_STORAGE_HOST"
diff --git a/experiments/katib_minikube/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py
index 1443ef4..c265eb3 100644
--- a/experiments/katib_minikube/katib_benchmark.py
+++ b/experiments/katib_minikube/katib_benchmark.py
@@ -34,6 +34,7 @@ def __init__(self, objective, grid, resources) -> None:
         config.load_kube_config()
 
         self.logging_level= self.resources.get("loggingLevel",log.INFO)
+        self.clean_up = self.resources.get("cleanUp",True)
 
         log.basicConfig(format='%(asctime)s Katib Benchmark %(levelname)s: %(message)s',level=self.logging_level)
 
@@ -79,6 +80,8 @@ def deploy(self):
             on a platform, e.g,. in the case of Kubernetes it referes to the steps nassary to deploy all pods
             and services in kubernetes.
         """
+
+
         log.info("Deploying katib:")
         res = os.popen('kubectl apply -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"').read()
         log.info(res)
@@ -106,17 +109,23 @@ def deploy(self):
                         log.info(f'{ob.metadata.name} is ready')
                         monitored_pods.remove(name)
                         deployed = deployed + 1
+                        log.info(monitored_pods)
 
-                        #if all monitored pods are running the deployment process was ended
-                        if(deployed == 4 ):
-                            w.stop()
-                            log.info("Finished deploying crucial pods")
+                    # Checking for status of cert generator  
+                    elif name == "katib-cert-generator" and ob.status.phase == "Succeeded": 
+                        log.info(f'{ob.metadata.name} is Succeeded')
+                        monitored_pods.remove(name)
+                        log.info(monitored_pods)
 
 
+                    #if all monitored pods are running the deployment process was ended
+                    if not monitored_pods:
 
+                        w.stop()
+                        log.info("Finished deploying crucial pods")
 
 
-        #TODO waiit untill all pods are running
+        
 
 
 
@@ -188,7 +197,28 @@ def run(self):
             except ApiException as e:
                 log.info("Exception when calling CustomObjectsApi->create_cluster_custom_object: %s\n" % e)
             
- 
+        
+        #Blocking untill the run is finished
+        #The GET /apis/{group}/{version}/namespaces/{namespace}/{plural}/{name} endpoint doesnt support watch argument.
+
+        experiment = self.get_experiment()
+        while "status" not in experiment:
+            log.info("Waitinng for the status")
+            sleep(5)
+            experiment = self.get_experiment()
+            # log.info(experiment)
+        
+    
+        while experiment["status"]["conditions"][-1]["type"]!="Succeeded":
+            experiment = self.get_experiment()
+            succeeded =  experiment["status"].get("trialsSucceeded",0)
+            
+            log.info(f'Status: {experiment["status"]["conditions"][-1]["reason"]} {succeeded} trials of {self.jobsCount} succeeded')
+            log.debug(experiment["status"])
+            
+            sleep(2)
+            
+    
     def collect_benchmark_metrics(self):
         return super().collect_benchmark_metrics()
 
@@ -209,65 +239,21 @@ def get_experiment(self):
        
             # log.info(resource)
             # log.info(resource["status"])
+            return resource
         except ApiException as e:
             log.info("Exception when calling CustomObjectsApi->get_namespaced_custom_object_status: %s\n" % e)
-        return resource
+            return ""
 
 
     def collect_run_results(self):
         
         log.info("Collecting run results:")
-        config.load_kube_config()
-        w = watch.Watch()
-        c = client.CustomObjectsApi()
-        deployed = 0
-        
-        #TODO is there a way to run watch on namespaced custome object?
-        # for e in w.stream(c.get_namespaced_custom_object,
-        #                     group=self.group,
-        #                     version=self.version,
-        #                     namespace=self.namespace,
-        #                     name=self.study_name,
-        #                     plural=self.plural):
-        #     experiment = e["object"]
-        #     if "status" not in experiment:
-        #         log.info("Waitinng for the status")
-        #         sleep(5)
-        #     elif experiment["status"]["conditions"][-1]["type"]!="Succeeded":
-        #          log.info("chuja")
-        #          if "trialsSucceeded" in experiment["status"]:
-        #             log.info(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded')
-        #             log.info(experiment["status"]["conditions"][-1])
-        #             sleep(2)
-        #     else:
-        #         log.info("\n Experiment finished with following optimal trial:")
-        #         log.info(experiment["status"]["currentOptimalTrial"])  
-                
-                
-           
-
-        
-        experiment = self.get_experiment()
-        while "status" not in experiment:
-            log.info("Waitinng for the status")
-            sleep(5)
-            experiment = self.get_experiment()
-            # log.info(experiment)
-        
-    
-        while experiment["status"]["conditions"][-1]["type"]!="Succeeded":
-            experiment = self.get_experiment()
-            log.info("\nWaiting for the experiment to finish:")
-            if "trialsSucceeded" in experiment["status"]:
-                log.info(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded')
-            log.info(experiment["status"]["conditions"][-1])
-            sleep(2)
            
-       
+        experiment = self.get_experiment() 
         
-        log.info("\n Experiment finished with following optimal trial:")
-        log.info(experiment["status"]["currentOptimalTrial"])
-         
+        log.debug("\n Experiment finished with following optimal trial:")
+        log.debug(experiment["status"]["currentOptimalTrial"])
+        return experiment["status"]["currentOptimalTrial"] 
         
     
     def test(self):
@@ -313,9 +299,10 @@ def undeploy(self):
             log.info(err)
             log.info("Namespace sucessfully deleted")
             
-            log.info("Deleteing task docker image from minikube")
-            sleep(1)
-            self.image_builder.cleanup(self.trial_tag)
+            if self.clean_up:
+                log.info("Deleteing task docker image from minikube")
+                sleep(2)
+                self.image_builder.cleanup(self.trial_tag)
             log.info("Finished undeploying")
             return
         
@@ -333,9 +320,10 @@ def undeploy(self):
                     log.info(err)
                     log.info("Namespace sucessfully deleted")
                     
-                    log.info("Deleteing task docker image from minikube")
-                    sleep(1)
-                    self.image_builder.cleanup(self.trial_tag)
+                    if self.clean_up:
+                        log.info("Deleteing task docker image from minikube")
+                        sleep(2)
+                        self.image_builder.cleanup(self.trial_tag)
                     w.stop()
                     break
 
@@ -355,7 +343,8 @@ def undeploy(self):
         "dockerImageTag":"light_task",
         "workerCount":10,
         "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(),
-        "generateNewDockerImage": True
+        "generateNewDockerImage": True,
+        "cleanUp": True,
         })
     bench.deploy()
     bench.setup()

From a494587c07b13cd46b798d5bf8605af234cc7d15 Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Tue, 30 Aug 2022 17:35:03 +0200
Subject: [PATCH 8/9] 1. Clean up in katib benchmark. 2. Fixed light_task.
 3. Latency tracker prints out the exception which occurs during latency
 recording

---
 .gitignore                                    |   1 +
 experiments/katib_minikube/grid.yaml          |  10 +-
 experiments/katib_minikube/katib_benchmark.py | 143 ++++++------------
 .../katib_minikube/light_task/Dockerfile      |   2 +-
 ml_benchmark/latency_tracker.py               |   5 +-
 5 files changed, 59 insertions(+), 102 deletions(-)

diff --git a/.gitignore b/.gitignore
index 89eb912..07d8d98 100644
--- a/.gitignore
+++ b/.gitignore
@@ -131,5 +131,6 @@ dmypy.json
 exp__*
 experiments/simple_raytune/benchmark__RaytuneBenchmark
 experiments/optuna_minikube/benchmark__OptunaMinikubeBenchmark
+experiments/katib_minikube/benchmark__KatibBenchmark
 
 data/
\ No newline at end of file
diff --git a/experiments/katib_minikube/grid.yaml b/experiments/katib_minikube/grid.yaml
index d873f53..b9ae13c 100644
--- a/experiments/katib_minikube/grid.yaml
+++ b/experiments/katib_minikube/grid.yaml
@@ -3,7 +3,7 @@ apiVersion: kubeflow.org/v1beta1
 kind: Experiment
 metadata:
   namespace: kubeflow
-  name: katib-study-48
+  name: katib-study-50
 spec:
   objective:
     type: maximize
@@ -11,9 +11,9 @@ spec:
     objectiveMetricName: precision
   algorithm:
     algorithmName: grid
-  parallelTrialCount: 10
-  maxTrialCount: 10
-  maxFailedTrialCount: 10
+  parallelTrialCount: 5
+  maxTrialCount: 5
+  maxFailedTrialCount: 5
   parameters:
     - name: lr
       parameterType: double
@@ -44,7 +44,7 @@ spec:
                   - "--lr=${trialParameters.learningRate}"
                 env:
                   - name: STUDY_NAME
-                    value: "katib-study-48"
+                    value: "katib-study-50"
                   - name: DB_CONN
                     value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
                   - name: "METRICS_STORAGE_HOST"
diff --git a/experiments/katib_minikube/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py
index c265eb3..b821e7f 100644
--- a/experiments/katib_minikube/katib_benchmark.py
+++ b/experiments/katib_minikube/katib_benchmark.py
@@ -1,5 +1,6 @@
 from os import path
 import os
+from shutil import ExecError
 import sys
 from time import sleep
 from urllib.request import urlopen
@@ -20,60 +21,30 @@
 
 class KatibBenchmark(Benchmark):
 
-    def __init__(self, objective, grid, resources) -> None:
-        self.objective = objective
-        self.grid = grid
+    def __init__(self, resources) -> None:
         self.resources = resources
         self.group="kubeflow.org"
         self.version="v1beta1"
-        self.namespace='kubeflow'
+        self.namespace="kubeflow"
         self.plural="experiments"
         self.experiment_file_name = "grid.yaml"
         self.metrics_ip = resources.get("metricsIP")
         self.generate_new_docker_image = resources.get("generateNewDockerImage",True)     
-        config.load_kube_config()
-
-        self.logging_level= self.resources.get("loggingLevel",log.INFO)
         self.clean_up = self.resources.get("cleanUp",True)
+        self.trial_tag = resources.get("dockerImageTag","mnist_katib")
+        self.study_name= resources.get("studyName",f'katib-study-{random.randint(0, 100)}')
+        self.workerCpu = resources.get("workerCpu",2)
+        self.workerMemory= resources.get("workerMemory",2)
+        self.workerCount = resources.get("workerCount",5)
+        self.jobsCount = resources.get("jobsCount",6)
 
+        self.logging_level= self.resources.get("loggingLevel",log.INFO)
         log.basicConfig(format='%(asctime)s Katib Benchmark %(levelname)s: %(message)s',level=self.logging_level)
 
         
 
-    
-        if "dockerImageTag" in self.resources:
-            self.trial_tag = self.resources["dockerImageTag"]
-        else:
-            self.trial_tag = "mnist_katib"
-
-
-        
-
-
-        if "studyName" in self.resources:
-            self.study_name = self.resources["studyName"]
-        else:
-            self.study_name = f"katib-study-{random.randint(0, 100)}"
-
-        if "workerCpu" in self.resources:
-            self.workerCpu = self.resources["workerCpu"]
-        else:
-            self.workerCpu = 2
-
-        if "workerMemory" in resources:
-            self.workerMemory = self.resources["workerMemory"]
-        else:
-            self.workerMemory = 2
 
-        if "workerCount" in resources:
-            self.workerCount = self.resources["workerCount"]
-        else:
-            self.workerCount = 5
-
-        if "jobs_Count" in resources:
-            self.jobsCount = self.resources["jobs_Count"]
-        else:
-            self.jobsCount = 6
+    
     def deploy(self):
         """
             With the completion of this step the desired architecture of the HPO Framework should be running
@@ -109,13 +80,11 @@ def deploy(self):
                         log.info(f'{ob.metadata.name} is ready')
                         monitored_pods.remove(name)
                         deployed = deployed + 1
-                        log.info(monitored_pods)
 
                     # Checking for status of cert generator  
                     elif name == "katib-cert-generator" and ob.status.phase == "Succeeded": 
                         log.info(f'{ob.metadata.name} is Succeeded')
                         monitored_pods.remove(name)
-                        log.info(monitored_pods)
 
 
                     #if all monitored pods are running the deployment process was ended
@@ -200,7 +169,8 @@ def run(self):
         
         #Blocking untill the run is finished
         #The GET /apis/{group}/{version}/namespaces/{namespace}/{plural}/{name} endpoint doesnt support watch argument.
-
+       
+        #TODO changing to watching of the jobs instead of the experiment crd?
         experiment = self.get_experiment()
         while "status" not in experiment:
             log.info("Waitinng for the status")
@@ -237,8 +207,6 @@ def get_experiment(self):
                     plural=self.plural,
                 )
        
-            # log.info(resource)
-            # log.info(resource["status"])
             return resource
         except ApiException as e:
             log.info("Exception when calling CustomObjectsApi->get_namespaced_custom_object_status: %s\n" % e)
@@ -260,12 +228,12 @@ def test(self):
         return super().test()
 
     def undeploy(self):
+              
         log.info("Deleteing the experiment crd from the cluster")
-        # res = os.popen('kubectl delete -f grid.yaml').read()
-        # log.info(res)
         config.load_kube_config()
         api = client.CustomObjectsApi()
         try:
+            #deleting all experiment crds 
             resource = api.delete_collection_namespaced_custom_object(
                     group=self.group,
                     version=self.version,
@@ -273,59 +241,40 @@ def undeploy(self):
                     plural=self.plural,
                 )
             log.info(resource)
-            # log.info(resource)
-            # log.info(resource["status"])
+       
         except ApiException as e:
             log.info("Exception when calling CustomObjectsApi->get_namespaced_custom_object_status: %s\n" % e)
 
 
-        
-        
-        # log.info("Undeploying katib:")
-        # res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"').read()
-        # log.info(res)
-
 
         w = watch.Watch()
         c = client.CoreV1Api()
         log.info("Deleteing the namespace:")
         res = c.delete_namespace_with_http_info(name=self.namespace)    
-
+        
 
         log.info("Checking status of the  namespace:")
-        try:
-            log.debug(c.read_namespace_status_with_http_info(name=self.namespace))
-        except ApiException as err:
-            log.info(err)
-            log.info("Namespace sucessfully deleted")
-            
-            if self.clean_up:
-                log.info("Deleteing task docker image from minikube")
-                sleep(2)
-                self.image_builder.cleanup(self.trial_tag)
-            log.info("Finished undeploying")
-            return
-        
-      
-        log.info("Namespace still not termintated")
         #if the namespace was still existent we must wait till it is really terminated
         for e in w.stream(c.list_namespace):
             ob = e["object"]
             # if the status of our namespace was changed we check if it the namespace was really removed from the cluster by requesting and expecting it to be not found
-            #TODO do this in other way            
             if ob.metadata.name == self.namespace:
                 try:
                     log.debug(c.read_namespace_status_with_http_info(name=self.namespace))
                 except ApiException as err:
-                    log.info(err)
-                    log.info("Namespace sucessfully deleted")
                     
                     if self.clean_up:
                         log.info("Deleteing task docker image from minikube")
                         sleep(2)
                         self.image_builder.cleanup(self.trial_tag)
-                    w.stop()
-                    break
+                    
+                    
+                    if(err.status != 404):
+                        raise Exception("Something went wrong",err)
+                    else: 
+                        log.info("Namespace sucessfully deleted")
+                        w.stop()
+
 
         log.info("Finished undeploying")
 
@@ -335,22 +284,28 @@ def undeploy(self):
 
 if __name__ == "__main__":
     #main()
-    bench = KatibBenchmark(1,1,resources={
-        # "dockerUserLogin":"",
-        # "dockerUserPassword":"",
-        # "studyName":""
-        "jobs_Count":10,
-        "dockerImageTag":"light_task",
-        "workerCount":10,
-        "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(),
-        "generateNewDockerImage": True,
-        "cleanUp": True,
-        })
-    bench.deploy()
-    bench.setup()
-    bench.run()
-    bench.collect_run_results()
-    bench.undeploy()
-
-
 
+    resources={
+            # "dockerUserLogin":"",
+            # "dockerUserPassword":"",
+            # "studyName":""
+            "jobsCount":5,
+            "dockerImageTag":"light_task",
+            "workerCount":5,
+            "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(),
+            "generateNewDockerImage": True,
+            "cleanUp": True ,
+            }
+    # bench = KatibBenchmark(resources=resources)
+    # bench.deploy()
+    # bench.setup()
+    # bench.run()
+    # bench.collect_run_results()
+    # bench.undeploy()
+
+
+
+    from ml_benchmark.benchmark_runner import BenchmarkRunner
+    runner = BenchmarkRunner(
+        benchmark_cls=KatibBenchmark, resources=resources)
+    runner.run()
\ No newline at end of file
diff --git a/experiments/katib_minikube/light_task/Dockerfile b/experiments/katib_minikube/light_task/Dockerfile
index b2bd6aa..8a05a7d 100644
--- a/experiments/katib_minikube/light_task/Dockerfile
+++ b/experiments/katib_minikube/light_task/Dockerfile
@@ -11,4 +11,4 @@ COPY ml_benchmark ml_benchmark
 
 RUN pip install -e .
 
-CMD ["python", "experiments/katib_minikube/task_light/task.py"]
\ No newline at end of file
+CMD ["python", "experiments/katib_minikube/light_task/task.py"]
\ No newline at end of file
diff --git a/ml_benchmark/latency_tracker.py b/ml_benchmark/latency_tracker.py
index 0f98094..f5929f8 100644
--- a/ml_benchmark/latency_tracker.py
+++ b/ml_benchmark/latency_tracker.py
@@ -61,8 +61,9 @@ def track(self, latency_obj):
                 stmt = insert(latency).values(latency_obj.to_dict())
                 conn.execute(stmt)
             print(f"Latency Recorded! Latency: {latency_obj.to_dict()}")
-        except Exception:
-            print(f"Failed to record latency: {latency_obj.to_dict()}")
+        except Exception as e:
+            print(f"Failed to record latency: {latency_obj.to_dict()}",e)
+            
 
     def _get_connection_string(self):
         # XXX: list order is implicitly a priority

From 45654579ecb5f3dad9d5c82a4ebe14066bebe5df Mon Sep 17 00:00:00 2001
From: witja46 <witja46@gmail.com>
Date: Wed, 31 Aug 2022 13:47:28 +0200
Subject: [PATCH 9/9] 1. Changed the mnist_task base image to a lighter one.
 2. Katib_benchmark runs mnist_task by default.

---
 experiments/katib_minikube/grid.yaml             | 14 +++++++-------
 experiments/katib_minikube/katib_benchmark.py    |  9 ++++-----
 experiments/katib_minikube/mnist_task/Dockerfile |  9 ++++-----
 3 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/experiments/katib_minikube/grid.yaml b/experiments/katib_minikube/grid.yaml
index b9ae13c..04c4f67 100644
--- a/experiments/katib_minikube/grid.yaml
+++ b/experiments/katib_minikube/grid.yaml
@@ -3,7 +3,7 @@ apiVersion: kubeflow.org/v1beta1
 kind: Experiment
 metadata:
   namespace: kubeflow
-  name: katib-study-50
+  name: katib-study-32
 spec:
   objective:
     type: maximize
@@ -11,9 +11,9 @@ spec:
     objectiveMetricName: precision
   algorithm:
     algorithmName: grid
-  parallelTrialCount: 5
-  maxTrialCount: 5
-  maxFailedTrialCount: 5
+  parallelTrialCount: 2
+  maxTrialCount: 2
+  maxFailedTrialCount: 2
   parameters:
     - name: lr
       parameterType: double
@@ -35,16 +35,16 @@ spec:
           spec:
             containers:
               - name: training-container
-                image: light_task
+                image: mnist_task
                 imagePullPolicy: IfNotPresent
                 command:
                   - "python3"
-                  - "experiments/katib_minikube/light_task/task.py"
+                  - "experiments/katib_minikube/mnist_task/task.py"
                   - "--batch-size=64"
                   - "--lr=${trialParameters.learningRate}"
                 env:
                   - name: STUDY_NAME
-                    value: "katib-study-50"
+                    value: "katib-study-32"
                   - name: DB_CONN
                     value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
                   - name: "METRICS_STORAGE_HOST"
diff --git a/experiments/katib_minikube/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py
index b821e7f..d4016b3 100644
--- a/experiments/katib_minikube/katib_benchmark.py
+++ b/experiments/katib_minikube/katib_benchmark.py
@@ -31,7 +31,7 @@ def __init__(self, resources) -> None:
         self.metrics_ip = resources.get("metricsIP")
         self.generate_new_docker_image = resources.get("generateNewDockerImage",True)     
         self.clean_up = self.resources.get("cleanUp",True)
-        self.trial_tag = resources.get("dockerImageTag","mnist_katib")
+        self.trial_tag = resources.get("dockerImageTag","mnist_task")
         self.study_name= resources.get("studyName",f'katib-study-{random.randint(0, 100)}')
         self.workerCpu = resources.get("workerCpu",2)
         self.workerMemory= resources.get("workerMemory",2)
@@ -66,7 +66,6 @@ def deploy(self):
         log.info("Waiting for all Katib pods to be ready:")
         # From all pods that polyaxon starts we are onlly really intrested for following 4 that are crucial for runnig of the experiments 
         monitored_pods = ["katib-cert-generator","katib-db-manager","katib-mysql","katib-ui","katib-controller"]
-        # TODO changing to list_namespaced_deployments?
         for e in w.stream(c.list_namespaced_pod, namespace=self.namespace):
             ob = e["object"]          
 
@@ -289,9 +288,9 @@ def undeploy(self):
             # "dockerUserLogin":"",
             # "dockerUserPassword":"",
             # "studyName":""
-            "jobsCount":5,
-            "dockerImageTag":"light_task",
-            "workerCount":5,
+            "jobsCount":2,
+            # "dockerImageTag":"light_task",
+            "workerCount":2,
             "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(),
             "generateNewDockerImage": True,
             "cleanUp": True ,
diff --git a/experiments/katib_minikube/mnist_task/Dockerfile b/experiments/katib_minikube/mnist_task/Dockerfile
index 7e62b3b..0415797 100644
--- a/experiments/katib_minikube/mnist_task/Dockerfile
+++ b/experiments/katib_minikube/mnist_task/Dockerfile
@@ -1,11 +1,10 @@
-#TODO change to some slimer base image
-FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime
-#FROM python:3.9-slim
+FROM python:3.9.6-slim
 
 RUN pip install pip --upgrade
+RUN pip install torch==1.10.2 torchvision==0.11.3
+RUN pip install optuna==2.10.1
 
-COPY experiments experiments
-# COPY data data
+COPY experiments/katib_minikube experiments/katib_minikube
 COPY setup.py setup.py
 COPY ml_benchmark ml_benchmark