From 1b3d5b87c9c908e3e8f7f04f4c0c61d872fefd62 Mon Sep 17 00:00:00 2001 From: witja46 Date: Tue, 16 Aug 2022 17:29:54 +0200 Subject: [PATCH 1/9] First Katib script version --- .../katib_benchmark/experiment_template.yaml | 44 ++++ experiments/katib_benchmark/grid.yaml | 44 ++++ .../katib_benchmark/katib_benchmark.py | 245 ++++++++++++++++++ .../katib_benchmark/mnist_task/Dockerfile | 15 ++ .../katib_benchmark/mnist_task/README.md | 11 + .../mnist_task/requirements.txt | 2 + .../katib_benchmark/mnist_task/task.py | 58 +++++ 7 files changed, 419 insertions(+) create mode 100644 experiments/katib_benchmark/experiment_template.yaml create mode 100644 experiments/katib_benchmark/grid.yaml create mode 100644 experiments/katib_benchmark/katib_benchmark.py create mode 100644 experiments/katib_benchmark/mnist_task/Dockerfile create mode 100644 experiments/katib_benchmark/mnist_task/README.md create mode 100644 experiments/katib_benchmark/mnist_task/requirements.txt create mode 100644 experiments/katib_benchmark/mnist_task/task.py diff --git a/experiments/katib_benchmark/experiment_template.yaml b/experiments/katib_benchmark/experiment_template.yaml new file mode 100644 index 0000000..3b576d1 --- /dev/null +++ b/experiments/katib_benchmark/experiment_template.yaml @@ -0,0 +1,44 @@ +--- +apiVersion: kubeflow.org/v1beta1 +kind: Experiment +metadata: + namespace: kubeflow + name: $study_name +spec: + objective: + type: maximize + goal: 0.99 + objectiveMetricName: Validation-accuracy + algorithm: + algorithmName: grid + parallelTrialCount: $worker_num + maxTrialCount: $jobs_num + maxFailedTrialCount: $worker_num + parameters: + - name: lr + parameterType: double + feasibleSpace: + min: "0.001" + max: "0.02" + step: "0.0001" + trialTemplate: + primaryContainerName: training-container + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container + image: $worker_image + command: + - "python3" + - "/opt/mnist_task/task.py" + - "--batch-size=64" + - "--lr=$trialParameters" + restartPolicy: Never diff --git a/experiments/katib_benchmark/grid.yaml b/experiments/katib_benchmark/grid.yaml new file mode 100644 index 0000000..e39d6cf --- /dev/null +++ b/experiments/katib_benchmark/grid.yaml @@ -0,0 +1,44 @@ +--- +apiVersion: kubeflow.org/v1beta1 +kind: Experiment +metadata: + namespace: kubeflow + name: katib-study-79 +spec: + objective: + type: maximize + goal: 0.99 + objectiveMetricName: Validation-accuracy + algorithm: + algorithmName: grid + parallelTrialCount: 20 + maxTrialCount: 20 + maxFailedTrialCount: 20 + parameters: + - name: lr + parameterType: double + feasibleSpace: + min: "0.001" + max: "0.02" + step: "0.0001" + trialTemplate: + primaryContainerName: training-container + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container + image: witja46/mnist_katib:latest + command: + - "python3" + - "/opt/mnist_task/task.py" + - "--batch-size=64" + - "--lr=${trialParameters.learningRate}" + restartPolicy: Never diff --git a/experiments/katib_benchmark/katib_benchmark.py b/experiments/katib_benchmark/katib_benchmark.py new file mode 100644 index 0000000..b1381d9 --- /dev/null +++ b/experiments/katib_benchmark/katib_benchmark.py @@ -0,0 +1,245 @@ +from os import path +import os +import sys +from time import sleep +from kubernetes.client.rest import ApiException +import random +from kubernetes import client, config +from string import Template +import yaml +import docker +PROJECT_ROOT = path.abspath(path.join(__file__ ,"../../..")) +sys.path.append(PROJECT_ROOT) +from ml_benchmark.benchmark_runner import Benchmark + + + + +class KatibBenchmark(Benchmark): + + def __init__(self, objective, grid, resources) -> None: + self.objective = objective + self.grid = grid + self.resources = resources + self.group="kubeflow.org" + self.version="v1beta1" + self.namespace='kubeflow' + self.plural="experiments" + self.experiment_file_name = "grid.yaml" + + config.load_kube_config() + + + + + if "dockerImageTag" in self.resources: + self.trial_tag = self.resources["dockerImageTag"] + else: + self.trial_tag = "mnist_katib:latest" + + if "dockerUserLogin" in self.resources: + self.docker_user = self.resources["dockerUserLogin"] + self.trial_tag = f'{self.docker_user}/{self.trial_tag}' + else: + self.docker_user = "" + self.trial_tag = "witja46/mnist_katib:latest" + + if "dockerUserPassword" in self.resources: + self.docker_pasword = self.resources["dockerUserPassword"] + else: + self.docker_pasword = "" + + + if "studyName" in self.resources: + self.study_name = self.resources["studyName"] + else: + self.study_name = f"katib-study-{random.randint(0, 100)}" + + if "workerCpu" in self.resources: + self.workerCpu = self.resources["workerCpu"] + else: + self.workerCpu = 2 + + if "workerMemory" in resources: + self.workerMemory = self.resources["workerMemory"] + else: + self.workerMemory = 2 + + if "workerCount" in resources: + self.workerCount = self.resources["workerCount"] + else: + self.workerCount = 5 + + if "jobs_Count" in resources: + self.jobsCount = self.resources["jobs_Count"] + else: + self.jobsCount = 6 + def deploy(self): + """ + With the completion of this step the desired architecture of the HPO Framework should be running + on a platform, e.g,. in the case of Kubernetes it referes to the steps nassary to deploy all pods + and services in kubernetes. + """ + print("Deploying katib:") + res = os.popen('kubectl apply -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master').read() + print(res) + + + #TODO waiit untill all pods are running + + + + + + def setup(self): + """ + Every Operation that is needed before the actual optimization (trial) starts and that is not relevant + for starting up workers or the necessary architecture. + """ + + #creating experiment yaml + + experiment_definition = { + "worker_num": self.workerCount, + "jobs_num":self.jobsCount, + "worker_cpu": self.workerCpu, + "worker_mem": f"{self.workerMemory}Gi", + "worker_image": self.trial_tag, + "study_name": self.study_name, + "trialParameters":"${trialParameters.learningRate}" + } + + #loading and fulling the template + with open(path.join(path.dirname(__file__), "experiment_template.yaml"), "r") as f: + job_template = Template(f.read()) + job_yml_objects = job_template.substitute(experiment_definition) + + #writing the experiment definition into the file + with open(path.join(path.dirname(__file__), self.experiment_file_name), "w") as f: + f.write(job_yml_objects) + print("Experiment yaml created") + + + # Creating new docker image if credentials were passed + if "dockerUserLogin" in self.resources: + + #creating task docker image + print("Creating task docker image") + self.client = docker.client.from_env() + image, logs = self.client.images.build(path="./mnist_task",tag=self.trial_tag) + print(f"Image: {self.trial_tag}") + for line in logs : + print(line) + + #pushing to repo + self.client.login(username=self.docker_user, password=self.docker_pasword) + for line in self.client.images.push(self.trial_tag, stream=True, decode=True): + print(line) + + + + def run(self): + + print("Starting Katib experiment:") + api_instance= client.CustomObjectsApi() + + #Loading experiment definition + with open(path.join(path.dirname(__file__), self.experiment_file_name), "r") as f: + body = yaml.safe_load(f) + + #Starting experiment by creating experiment crd with help of kubernetes API + try: + api_response = api_instance.create_namespaced_custom_object( + group=self.group, + version=self.version, + namespace=self.namespace, + body=body, + plural=self.plural, + ) + print("Succses: Experiment started") + # print(api_response) + except ApiException as e: + print("Exception when calling CustomObjectsApi->create_cluster_custom_object: %s\n" % e) + + + def collect_benchmark_metrics(self): + return super().collect_benchmark_metrics() + + + + def get_experiment(self): + config.load_kube_config() + api = client.CustomObjectsApi() + #requesting experiments crd that contains data about finisched experiment + try: + resource = api.get_namespaced_custom_object_status( + group=self.group, + version=self.version, + namespace=self.namespace, + name=self.study_name, + plural=self.plural, + ) + + # print(resource) + # print(resource["status"]) + except ApiException as e: + print("Exception when calling CustomObjectsApi->get_namespaced_custom_object_status: %s\n" % e) + return resource + + + def collect_run_results(self): + + print("Collecting run results:") + + sleep(5) + experiment = self.get_experiment() + while "status" not in experiment: + print("Waitinng for the status") + sleep(5) + experiment = self.get_experiment() + print(experiment) + + + while experiment["status"]["conditions"][-1]["type"]!="Succeeded": + experiment = self.get_experiment() + print("\nWaiting for the experiment to finish:") + if "trialsSucceeded" in experiment["status"]: + print(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded') + print(experiment["status"]["conditions"][-1]) + sleep(2) + + + + print("\n Experiment finished with following optimal trial:") + print(experiment["status"]["currentOptimalTrial"]) + + + + def test(self): + return super().test() + + def undeploy(self): + return super().undeploy() + + def main(): + print("jeb") + + + + +if __name__ == "__main__": + #main() + bench = KatibBenchmark(1,1,resources={ + # "dockerUserLogin":"", + # "dockerUserPassword":"", + # "studyName":"" + "jobs_Count":20, + "workerCount":20 + }) + # bench.deploy() + bench.setup() + bench.run() + bench.collect_run_results() + + + diff --git a/experiments/katib_benchmark/mnist_task/Dockerfile b/experiments/katib_benchmark/mnist_task/Dockerfile new file mode 100644 index 0000000..1cbfd5b --- /dev/null +++ b/experiments/katib_benchmark/mnist_task/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.9-slim + +ADD . /opt/mnist_task +WORKDIR /opt/mnist_task + +# Add folder for the logs. +RUN mkdir /katib +RUN pip install --no-cache-dir -r requirements.txt + +RUN chgrp -R 0 /opt/mnist_task \ + && chmod -R g+rwX /opt/mnist_task \ + && chgrp -R 0 /katib \ + && chmod -R g+rwX /katib + +ENTRYPOINT ["python3", "/opt/mnist_task/task.py"] diff --git a/experiments/katib_benchmark/mnist_task/README.md b/experiments/katib_benchmark/mnist_task/README.md new file mode 100644 index 0000000..3326bc7 --- /dev/null +++ b/experiments/katib_benchmark/mnist_task/README.md @@ -0,0 +1,11 @@ +# PyTorch MNIST Image Classification Example + +This is PyTorch MNIST image classification training container with saving metrics +to the file or printing to the StdOut. It uses convolutional neural network to +train the model. + +Katib uses this training container in some Experiments, for instance in the +[file Metrics Collector example](../../metrics-collector/file-metrics-collector.yaml#L55-L64), +the [file Metrics Collector with logs in JSON format example](../../metrics-collector/file-metrics-collector-with-json-format.yaml#L52-L62), +the [median stopping early stopping rule with logs in JSON format example](../../early-stopping/median-stop-with-json-format.yaml#L62-L71) +and the [PyTorchJob example](../../kubeflow-training-operator/pytorchjob-mnist.yaml#L47-L54). diff --git a/experiments/katib_benchmark/mnist_task/requirements.txt b/experiments/katib_benchmark/mnist_task/requirements.txt new file mode 100644 index 0000000..9852601 --- /dev/null +++ b/experiments/katib_benchmark/mnist_task/requirements.txt @@ -0,0 +1,2 @@ +Pillow>=9.1.1 +numpy \ No newline at end of file diff --git a/experiments/katib_benchmark/mnist_task/task.py b/experiments/katib_benchmark/mnist_task/task.py new file mode 100644 index 0000000..a8dcc0c --- /dev/null +++ b/experiments/katib_benchmark/mnist_task/task.py @@ -0,0 +1,58 @@ +from __future__ import print_function + +import argparse +import logging +import os +import numpy as np +import time + +def train(times ,epoch): + loss = 1 + for x in np.arange(1,times + 1): + loss = 1 - (x -1 ) / times + time.sleep(0.01) + msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format( + epoch, x , times, + 100. * x / times, loss) + logging.info(msg) + + +def test(): + + test_accuracy =0.8 + logging.info("Validation-accuracy={:.4f}\n".format( + test_accuracy)) + + + +def main(): + + #parsing arguments + parser = argparse.ArgumentParser(description="MNIST Example") + parser.add_argument("--batch-size", type=int, default=64, metavar="N", + help="input batch size for training (default: 64)") + parser.add_argument("--epochs", type=int, default=10, metavar="N", + help="number of epochs to train (default: 10)") + parser.add_argument("--lr", type=float, default=0.01, metavar="LR", + help="learning rate (default: 0.01)") + args = parser.parse_args() + + #timestamps format + logging.basicConfig( + format="%(asctime)s %(levelname)-8s %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=logging.DEBUG) + + + #model taining and testing + epochs = args.epochs + batch_size = args.batch_size + lr = args.lr + for epoch in np.arange(1,epochs+1): + train(batch_size,epoch) + test() + + + +if __name__ == "__main__": + main() From 4adc75006247cf73e8d284ff5e222bd3fada4488 Mon Sep 17 00:00:00 2001 From: witja46 Date: Tue, 16 Aug 2022 23:33:24 +0200 Subject: [PATCH 2/9] 1.Added blocking in deploy phase 2.Docker image is now created in minikube. --- .../experiment_template.yaml | 16 +++- .../grid.yaml | 26 ++++-- .../katib_benchmark.py | 92 ++++++++++++------- .../katib_minikube/mnist_task/Dockerfile | 16 ++++ .../mnist_task/README.md | 0 .../mnist_task/requirements.txt | 0 experiments/katib_minikube/mnist_task/task.py | 78 ++++++++++++++++ .../mnist_task_old}/Dockerfile | 0 .../katib_minikube/mnist_task_old/README.md | 11 +++ .../mnist_task_old/requirements.txt | 2 + .../mnist_task_old}/task.py | 0 experiments/katib_minikube/requirements.txt | 2 + ml_benchmark/__init__.py | 2 +- 13 files changed, 198 insertions(+), 47 deletions(-) rename experiments/{katib_benchmark => katib_minikube}/experiment_template.yaml (65%) rename experiments/{katib_benchmark => katib_minikube}/grid.yaml (56%) rename experiments/{katib_benchmark => katib_minikube}/katib_benchmark.py (70%) create mode 100644 experiments/katib_minikube/mnist_task/Dockerfile rename experiments/{katib_benchmark => katib_minikube}/mnist_task/README.md (100%) rename experiments/{katib_benchmark => katib_minikube}/mnist_task/requirements.txt (100%) create mode 100644 experiments/katib_minikube/mnist_task/task.py rename experiments/{katib_benchmark/mnist_task => katib_minikube/mnist_task_old}/Dockerfile (100%) create mode 100644 experiments/katib_minikube/mnist_task_old/README.md create mode 100644 experiments/katib_minikube/mnist_task_old/requirements.txt rename experiments/{katib_benchmark/mnist_task => katib_minikube/mnist_task_old}/task.py (100%) create mode 100644 experiments/katib_minikube/requirements.txt diff --git a/experiments/katib_benchmark/experiment_template.yaml b/experiments/katib_minikube/experiment_template.yaml similarity index 65% rename from experiments/katib_benchmark/experiment_template.yaml rename to experiments/katib_minikube/experiment_template.yaml index 3b576d1..e733d32 100644 --- a/experiments/katib_benchmark/experiment_template.yaml +++ b/experiments/katib_minikube/experiment_template.yaml @@ -7,8 +7,8 @@ metadata: spec: objective: type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy + goal: 0.8 + objectiveMetricName: precision algorithm: algorithmName: grid parallelTrialCount: $worker_num @@ -20,7 +20,7 @@ spec: feasibleSpace: min: "0.001" max: "0.02" - step: "0.0001" + step: "0.001" trialTemplate: primaryContainerName: training-container trialParameters: @@ -36,9 +36,17 @@ spec: containers: - name: training-container image: $worker_image + imagePullPolicy: IfNotPresent command: - "python3" - - "/opt/mnist_task/task.py" + - "experiments/katib_minikube/mnist_task/task.py" - "--batch-size=64" - "--lr=$trialParameters" + env: + - name: STUDY_NAME + value: "$study_name" + - name: DB_CONN + value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" + - name: "METRICS_STORAGE_HOST" + value: "$metrics_ip" restartPolicy: Never diff --git a/experiments/katib_benchmark/grid.yaml b/experiments/katib_minikube/grid.yaml similarity index 56% rename from experiments/katib_benchmark/grid.yaml rename to experiments/katib_minikube/grid.yaml index e39d6cf..c37e5a0 100644 --- a/experiments/katib_benchmark/grid.yaml +++ b/experiments/katib_minikube/grid.yaml @@ -3,24 +3,24 @@ apiVersion: kubeflow.org/v1beta1 kind: Experiment metadata: namespace: kubeflow - name: katib-study-79 + name: katib-study-74 spec: objective: type: maximize - goal: 0.99 - objectiveMetricName: Validation-accuracy + goal: 0.8 + objectiveMetricName: precision algorithm: algorithmName: grid - parallelTrialCount: 20 - maxTrialCount: 20 - maxFailedTrialCount: 20 + parallelTrialCount: 4 + maxTrialCount: 4 + maxFailedTrialCount: 4 parameters: - name: lr parameterType: double feasibleSpace: min: "0.001" max: "0.02" - step: "0.0001" + step: "0.001" trialTemplate: primaryContainerName: training-container trialParameters: @@ -35,10 +35,18 @@ spec: spec: containers: - name: training-container - image: witja46/mnist_katib:latest + image: mnist_katib + imagePullPolicy: IfNotPresent command: - "python3" - - "/opt/mnist_task/task.py" + - "experiments/katib_minikube/mnist_task/task.py" - "--batch-size=64" - "--lr=${trialParameters.learningRate}" + env: + - name: STUDY_NAME + value: "katib-study-74" + - name: DB_CONN + value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" + - name: "METRICS_STORAGE_HOST" + value: "134.101.4.161" restartPolicy: Never diff --git a/experiments/katib_benchmark/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py similarity index 70% rename from experiments/katib_benchmark/katib_benchmark.py rename to experiments/katib_minikube/katib_benchmark.py index b1381d9..56131f7 100644 --- a/experiments/katib_benchmark/katib_benchmark.py +++ b/experiments/katib_minikube/katib_benchmark.py @@ -2,15 +2,18 @@ import os import sys from time import sleep +from urllib.request import urlopen from kubernetes.client.rest import ApiException import random -from kubernetes import client, config +from kubernetes import client, config, watch +from kubernetes.client.rest import ApiException from string import Template import yaml import docker -PROJECT_ROOT = path.abspath(path.join(__file__ ,"../../..")) -sys.path.append(PROJECT_ROOT) +import logging as log + from ml_benchmark.benchmark_runner import Benchmark +from ml_benchmark.utils.image_build_wrapper import builder_from_string @@ -26,28 +29,23 @@ def __init__(self, objective, grid, resources) -> None: self.namespace='kubeflow' self.plural="experiments" self.experiment_file_name = "grid.yaml" - + self.metrics_ip = resources.get("metricsIP") config.load_kube_config() + self.logging_level= self.resources.get("loggingLevel",log.INFO) + + log.basicConfig(format='%(asctime)s Katib Benchmark %(levelname)s: %(message)s',level=self.logging_level) + if "dockerImageTag" in self.resources: self.trial_tag = self.resources["dockerImageTag"] else: - self.trial_tag = "mnist_katib:latest" + self.trial_tag = "mnist_katib" + - if "dockerUserLogin" in self.resources: - self.docker_user = self.resources["dockerUserLogin"] - self.trial_tag = f'{self.docker_user}/{self.trial_tag}' - else: - self.docker_user = "" - self.trial_tag = "witja46/mnist_katib:latest" - if "dockerUserPassword" in self.resources: - self.docker_pasword = self.resources["dockerUserPassword"] - else: - self.docker_pasword = "" if "studyName" in self.resources: @@ -85,6 +83,38 @@ def deploy(self): print(res) + + config.load_kube_config() + w = watch.Watch() + c = client.CoreV1Api() + deployed = 0 + log.info("Waiting for all Katib pods to be ready:") + # From all pods that polyaxon starts we are onlly really intrested for following 4 that are crucial for runnig of the experiments + monitored_pods = ["katib-cert-generator","katib-db-manager","katib-mysql","katib-ui","katib-controller"] + # TODO changing to list_namespaced_deployments? + for e in w.stream(c.list_namespaced_pod, namespace=self.namespace): + ob = e["object"] + + for name in monitored_pods: + + #checking if it is one of the pods that we want to monitor + if name in ob.metadata.name: + + # Checking if the pod already is runnig and its underlying containers are ready + if ob.status.phase == "Running" and ob.status.container_statuses[0].ready: + log.info(f'{ob.metadata.name} is ready') + monitored_pods.remove(name) + deployed = deployed + 1 + + #if all monitored pods are running the deployment process was ended + if(deployed == 4 ): + w.stop() + log.info("Finished deploying crucial pods") + + + + + #TODO waiit untill all pods are running @@ -106,7 +136,8 @@ def setup(self): "worker_mem": f"{self.workerMemory}Gi", "worker_image": self.trial_tag, "study_name": self.study_name, - "trialParameters":"${trialParameters.learningRate}" + "trialParameters":"${trialParameters.learningRate}", + "metrics_ip":self.metrics_ip } #loading and fulling the template @@ -120,21 +151,15 @@ def setup(self): print("Experiment yaml created") - # Creating new docker image if credentials were passed - if "dockerUserLogin" in self.resources: - - #creating task docker image - print("Creating task docker image") - self.client = docker.client.from_env() - image, logs = self.client.images.build(path="./mnist_task",tag=self.trial_tag) - print(f"Image: {self.trial_tag}") - for line in logs : - print(line) - - #pushing to repo - self.client.login(username=self.docker_user, password=self.docker_pasword) - for line in self.client.images.push(self.trial_tag, stream=True, decode=True): - print(line) + # Creating new docker image if credentials were passed + log.info("Creating task docker image") + #creating docker image inside of the minikube + self.image_builder = builder_from_string("minikube")() + PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../")) + res = self.image_builder.deploy_image( + "experiments/katib_minikube/mnist_task/Dockerfile", self.trial_tag,PROJECT_ROOT) + print(res) + print(f"Image: {self.trial_tag}") @@ -233,8 +258,9 @@ def main(): # "dockerUserLogin":"", # "dockerUserPassword":"", # "studyName":"" - "jobs_Count":20, - "workerCount":20 + "jobs_Count":4, + "workerCount":4, + "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip() }) # bench.deploy() bench.setup() diff --git a/experiments/katib_minikube/mnist_task/Dockerfile b/experiments/katib_minikube/mnist_task/Dockerfile new file mode 100644 index 0000000..7e62b3b --- /dev/null +++ b/experiments/katib_minikube/mnist_task/Dockerfile @@ -0,0 +1,16 @@ +#TODO change to some slimer base image +FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime +#FROM python:3.9-slim + +RUN pip install pip --upgrade + +COPY experiments experiments +# COPY data data +COPY setup.py setup.py +COPY ml_benchmark ml_benchmark + + + +RUN pip install -e . + +CMD ["python", "experiments/katib_minikube/mnist_task/task.py"] diff --git a/experiments/katib_benchmark/mnist_task/README.md b/experiments/katib_minikube/mnist_task/README.md similarity index 100% rename from experiments/katib_benchmark/mnist_task/README.md rename to experiments/katib_minikube/mnist_task/README.md diff --git a/experiments/katib_benchmark/mnist_task/requirements.txt b/experiments/katib_minikube/mnist_task/requirements.txt similarity index 100% rename from experiments/katib_benchmark/mnist_task/requirements.txt rename to experiments/katib_minikube/mnist_task/requirements.txt diff --git a/experiments/katib_minikube/mnist_task/task.py b/experiments/katib_minikube/mnist_task/task.py new file mode 100644 index 0000000..2d3af82 --- /dev/null +++ b/experiments/katib_minikube/mnist_task/task.py @@ -0,0 +1,78 @@ +import argparse +import logging +import numpy as np +import time +import os +import sys +PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../../../")) +sys.path.append(PROJECT_ROOT) +from ml_benchmark.workload.mnist.mnist_task import MnistTask + + +def train(times ,epoch): + loss = 1 + for x in np.arange(1,times + 1): + loss = 1 - (x -1 ) / times + time.sleep(0.1) + msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format( + epoch, x , times, + 100. * x / times, loss) + logging.info(msg) + + +def test(): + + test_accuracy =0.8 + logging.info("Validation-accuracy={:.4f}\n".format( + test_accuracy)) + + + +def main(): + + + #parsing arguments + parser = argparse.ArgumentParser(description="MNIST Example") + parser.add_argument("--batch-size", type=int, default=64, metavar="N", + help="input batch size for training (default: 64)") + parser.add_argument("--epochs", type=int, default=10, metavar="N", + help="number of epochs to train (default: 10)") + parser.add_argument("--lr", type=float, default=0.01, metavar="LR", + help="learning rate (default: 0.01)") + args = parser.parse_args() + epochs = args.epochs + batch_size = args.batch_size + lr = args.lr + + + + + + + #MnistTask + task = MnistTask(config_init={"epochs": epochs}) + objective = task.create_objective() + #TODO add the weight decay to the definition of the template + objective.set_hyperparameters({"learning_rate": lr, "weight_decay": 0.01}) + objective.train() + validation_scores = objective.validate() + + + #Geting results + avg = validation_scores["weighted avg"] + print(f'precision={avg["precision"]}') + # print(f'f1-score={avg["f1-score"]}') + + # logging.basicConfig( + # format="%(asctime)s %(levelname)-8s %(message)s", + # datefmt="%Y-%m-%dT%H:%M:%SZ", + # level=logging.DEBUG) + + + # logging.info("precision={:.4f}\n".format( + # avg["precision"])) + + # logging.info("f1-score={:.4f}\n".format( + # avg["f1-score"])) +if __name__ == "__main__": + main() diff --git a/experiments/katib_benchmark/mnist_task/Dockerfile b/experiments/katib_minikube/mnist_task_old/Dockerfile similarity index 100% rename from experiments/katib_benchmark/mnist_task/Dockerfile rename to experiments/katib_minikube/mnist_task_old/Dockerfile diff --git a/experiments/katib_minikube/mnist_task_old/README.md b/experiments/katib_minikube/mnist_task_old/README.md new file mode 100644 index 0000000..3326bc7 --- /dev/null +++ b/experiments/katib_minikube/mnist_task_old/README.md @@ -0,0 +1,11 @@ +# PyTorch MNIST Image Classification Example + +This is PyTorch MNIST image classification training container with saving metrics +to the file or printing to the StdOut. It uses convolutional neural network to +train the model. + +Katib uses this training container in some Experiments, for instance in the +[file Metrics Collector example](../../metrics-collector/file-metrics-collector.yaml#L55-L64), +the [file Metrics Collector with logs in JSON format example](../../metrics-collector/file-metrics-collector-with-json-format.yaml#L52-L62), +the [median stopping early stopping rule with logs in JSON format example](../../early-stopping/median-stop-with-json-format.yaml#L62-L71) +and the [PyTorchJob example](../../kubeflow-training-operator/pytorchjob-mnist.yaml#L47-L54). diff --git a/experiments/katib_minikube/mnist_task_old/requirements.txt b/experiments/katib_minikube/mnist_task_old/requirements.txt new file mode 100644 index 0000000..9852601 --- /dev/null +++ b/experiments/katib_minikube/mnist_task_old/requirements.txt @@ -0,0 +1,2 @@ +Pillow>=9.1.1 +numpy \ No newline at end of file diff --git a/experiments/katib_benchmark/mnist_task/task.py b/experiments/katib_minikube/mnist_task_old/task.py similarity index 100% rename from experiments/katib_benchmark/mnist_task/task.py rename to experiments/katib_minikube/mnist_task_old/task.py diff --git a/experiments/katib_minikube/requirements.txt b/experiments/katib_minikube/requirements.txt new file mode 100644 index 0000000..8ef8413 --- /dev/null +++ b/experiments/katib_minikube/requirements.txt @@ -0,0 +1,2 @@ +kubernetes==24.2.0 +torch \ No newline at end of file diff --git a/ml_benchmark/__init__.py b/ml_benchmark/__init__.py index f628f98..b28a677 100644 --- a/ml_benchmark/__init__.py +++ b/ml_benchmark/__init__.py @@ -1,7 +1,7 @@ __version__ = "develop" install_requires = [ "scikit-learn==0.24.2", - "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==4.4.2", + "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==4.2.2", "psycopg2-binary"], test_install_requires = ["pytest==7.1.2", "pytest-cov==3.0.0"] URL = "https://github.com/gebauerm/ml_benchmark" From 0ebbeec34657cbbd4aace83f3593241b01b75a43 Mon Sep 17 00:00:00 2001 From: witja46 Date: Wed, 17 Aug 2022 12:40:26 +0200 Subject: [PATCH 3/9] Added undeploy with blocking. --- experiments/katib_minikube/grid.yaml | 4 +- experiments/katib_minikube/katib_benchmark.py | 105 ++++++++++++++---- ml_benchmark/__init__.py | 2 +- 3 files changed, 89 insertions(+), 22 deletions(-) diff --git a/experiments/katib_minikube/grid.yaml b/experiments/katib_minikube/grid.yaml index c37e5a0..a472b10 100644 --- a/experiments/katib_minikube/grid.yaml +++ b/experiments/katib_minikube/grid.yaml @@ -3,7 +3,7 @@ apiVersion: kubeflow.org/v1beta1 kind: Experiment metadata: namespace: kubeflow - name: katib-study-74 + name: katib-study-5 spec: objective: type: maximize @@ -44,7 +44,7 @@ spec: - "--lr=${trialParameters.learningRate}" env: - name: STUDY_NAME - value: "katib-study-74" + value: "katib-study-5" - name: DB_CONN value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" diff --git a/experiments/katib_minikube/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py index 56131f7..34bbbc6 100644 --- a/experiments/katib_minikube/katib_benchmark.py +++ b/experiments/katib_minikube/katib_benchmark.py @@ -29,7 +29,8 @@ def __init__(self, objective, grid, resources) -> None: self.namespace='kubeflow' self.plural="experiments" self.experiment_file_name = "grid.yaml" - self.metrics_ip = resources.get("metricsIP") + self.metrics_ip = resources.get("metricsIP") + self.generate_new_docker_image = resources.get("generateNewDockerImage",True) config.load_kube_config() self.logging_level= self.resources.get("loggingLevel",log.INFO) @@ -79,7 +80,7 @@ def deploy(self): and services in kubernetes. """ print("Deploying katib:") - res = os.popen('kubectl apply -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master').read() + res = os.popen('kubectl apply -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"').read() print(res) @@ -151,15 +152,16 @@ def setup(self): print("Experiment yaml created") - # Creating new docker image if credentials were passed - log.info("Creating task docker image") - #creating docker image inside of the minikube - self.image_builder = builder_from_string("minikube")() - PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../")) - res = self.image_builder.deploy_image( - "experiments/katib_minikube/mnist_task/Dockerfile", self.trial_tag,PROJECT_ROOT) - print(res) - print(f"Image: {self.trial_tag}") + #only generating the docker image if specified so. + if self.generate_new_docker_image: + log.info("Creating task docker image") + #creating docker image inside of the minikube + self.image_builder = builder_from_string("minikube")() + PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../")) + res = self.image_builder.deploy_image( + "experiments/katib_minikube/mnist_task/Dockerfile", self.trial_tag,PROJECT_ROOT) + print(res) + print(f"Image: {self.trial_tag}") @@ -182,7 +184,7 @@ def run(self): plural=self.plural, ) print("Succses: Experiment started") - # print(api_response) + print(api_response) except ApiException as e: print("Exception when calling CustomObjectsApi->create_cluster_custom_object: %s\n" % e) @@ -215,14 +217,42 @@ def get_experiment(self): def collect_run_results(self): print("Collecting run results:") + config.load_kube_config() + w = watch.Watch() + c = client.CustomObjectsApi() + deployed = 0 + + #TODO is there a way to run watch on namespaced custome object? + # for e in w.stream(c.get_namespaced_custom_object_with_http_info, + # group=self.group, + # version=self.version, + # namespace=self.namespace, + # name=self.study_name, + # plural=self.plural): + # experiment = e["object"] + # if "status" not in experiment: + # print("Waitinng for the status") + # sleep(5) + # elif experiment["status"]["conditions"][-1]["type"]!="Succeeded": + # print("chuja") + # if "trialsSucceeded" in experiment["status"]: + # print(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded') + # print(experiment["status"]["conditions"][-1]) + # sleep(2) + # else: + # print("\n Experiment finished with following optimal trial:") + # print(experiment["status"]["currentOptimalTrial"]) + + + + - sleep(5) experiment = self.get_experiment() while "status" not in experiment: print("Waitinng for the status") sleep(5) experiment = self.get_experiment() - print(experiment) + # print(experiment) while experiment["status"]["conditions"][-1]["type"]!="Succeeded": @@ -244,7 +274,42 @@ def test(self): return super().test() def undeploy(self): - return super().undeploy() + print("Undeploying katib:") + # res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"') + + + + w = watch.Watch() + c = client.CoreV1Api() + log.info("Deleteing the namespace:") + res = c.delete_namespace_with_http_info(name=self.namespace) + + + log.info("Checking status of the namespace:") + try: + log.debug(c.read_namespace_status_with_http_info(name=self.namespace)) + except ApiException as err: + log.info(err) + log.info("Namespace sucessfully deleted") + log.info("Finished undeploying") + return + + + #if the namespace was still existent we must wait till it is really terminated + for e in w.stream(c.list_namespace): + ob = e["object"] + # if the status of our namespace was changed we check if it the namespace was really removed from the cluster by requesting and expecting it to be not found + #TODO do this in other way + if ob.metadata.name == self.namespace: + try: + log.debug(c.read_namespace_status_with_http_info(name=self.namespace)) + except ApiException as err: + log.info(err) + log.info("Namespace sucessfully deleted") + w.stop() + break + + log.info("Finished undeploying") def main(): print("jeb") @@ -258,14 +323,16 @@ def main(): # "dockerUserLogin":"", # "dockerUserPassword":"", # "studyName":"" - "jobs_Count":4, - "workerCount":4, - "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip() + "jobs_Count":10, + "workerCount":10, + "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), + "generateNewDockerImage": True }) - # bench.deploy() + bench.deploy() bench.setup() bench.run() bench.collect_run_results() + bench.undeploy() diff --git a/ml_benchmark/__init__.py b/ml_benchmark/__init__.py index b28a677..2742851 100644 --- a/ml_benchmark/__init__.py +++ b/ml_benchmark/__init__.py @@ -1,7 +1,7 @@ __version__ = "develop" install_requires = [ "scikit-learn==0.24.2", - "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==4.2.2", + "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==4..2", "psycopg2-binary"], test_install_requires = ["pytest==7.1.2", "pytest-cov==3.0.0"] URL = "https://github.com/gebauerm/ml_benchmark" From 41c48b054acdf560e5eb3b7fef46ddc6f5621110 Mon Sep 17 00:00:00 2001 From: witja46 Date: Wed, 17 Aug 2022 12:58:58 +0200 Subject: [PATCH 4/9] Switched to logging, undeploying with kubectl delete --- experiments/katib_minikube/grid.yaml | 10 ++-- experiments/katib_minikube/katib_benchmark.py | 58 +++++++++---------- 2 files changed, 33 insertions(+), 35 deletions(-) diff --git a/experiments/katib_minikube/grid.yaml b/experiments/katib_minikube/grid.yaml index a472b10..df91f9a 100644 --- a/experiments/katib_minikube/grid.yaml +++ b/experiments/katib_minikube/grid.yaml @@ -3,7 +3,7 @@ apiVersion: kubeflow.org/v1beta1 kind: Experiment metadata: namespace: kubeflow - name: katib-study-5 + name: katib-study-22 spec: objective: type: maximize @@ -11,9 +11,9 @@ spec: objectiveMetricName: precision algorithm: algorithmName: grid - parallelTrialCount: 4 - maxTrialCount: 4 - maxFailedTrialCount: 4 + parallelTrialCount: 10 + maxTrialCount: 10 + maxFailedTrialCount: 10 parameters: - name: lr parameterType: double @@ -44,7 +44,7 @@ spec: - "--lr=${trialParameters.learningRate}" env: - name: STUDY_NAME - value: "katib-study-5" + value: "katib-study-22" - name: DB_CONN value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" diff --git a/experiments/katib_minikube/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py index 34bbbc6..1009a24 100644 --- a/experiments/katib_minikube/katib_benchmark.py +++ b/experiments/katib_minikube/katib_benchmark.py @@ -79,9 +79,9 @@ def deploy(self): on a platform, e.g,. in the case of Kubernetes it referes to the steps nassary to deploy all pods and services in kubernetes. """ - print("Deploying katib:") + log.info("Deploying katib:") res = os.popen('kubectl apply -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"').read() - print(res) + log.info(res) @@ -149,7 +149,7 @@ def setup(self): #writing the experiment definition into the file with open(path.join(path.dirname(__file__), self.experiment_file_name), "w") as f: f.write(job_yml_objects) - print("Experiment yaml created") + log.info("Experiment yaml created") #only generating the docker image if specified so. @@ -160,14 +160,14 @@ def setup(self): PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../")) res = self.image_builder.deploy_image( "experiments/katib_minikube/mnist_task/Dockerfile", self.trial_tag,PROJECT_ROOT) - print(res) - print(f"Image: {self.trial_tag}") + log.info(res) + log.info(f"Image: {self.trial_tag}") def run(self): - print("Starting Katib experiment:") + log.info("Starting Katib experiment:") api_instance= client.CustomObjectsApi() #Loading experiment definition @@ -183,10 +183,10 @@ def run(self): body=body, plural=self.plural, ) - print("Succses: Experiment started") - print(api_response) + log.info("Succses: Experiment started") + log.info(api_response) except ApiException as e: - print("Exception when calling CustomObjectsApi->create_cluster_custom_object: %s\n" % e) + log.info("Exception when calling CustomObjectsApi->create_cluster_custom_object: %s\n" % e) def collect_benchmark_metrics(self): @@ -207,16 +207,16 @@ def get_experiment(self): plural=self.plural, ) - # print(resource) - # print(resource["status"]) + # log.info(resource) + # log.info(resource["status"]) except ApiException as e: - print("Exception when calling CustomObjectsApi->get_namespaced_custom_object_status: %s\n" % e) + log.info("Exception when calling CustomObjectsApi->get_namespaced_custom_object_status: %s\n" % e) return resource def collect_run_results(self): - print("Collecting run results:") + log.info("Collecting run results:") config.load_kube_config() w = watch.Watch() c = client.CustomObjectsApi() @@ -231,17 +231,17 @@ def collect_run_results(self): # plural=self.plural): # experiment = e["object"] # if "status" not in experiment: - # print("Waitinng for the status") + # log.info("Waitinng for the status") # sleep(5) # elif experiment["status"]["conditions"][-1]["type"]!="Succeeded": - # print("chuja") + # log.info("chuja") # if "trialsSucceeded" in experiment["status"]: - # print(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded') - # print(experiment["status"]["conditions"][-1]) + # log.info(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded') + # log.info(experiment["status"]["conditions"][-1]) # sleep(2) # else: - # print("\n Experiment finished with following optimal trial:") - # print(experiment["status"]["currentOptimalTrial"]) + # log.info("\n Experiment finished with following optimal trial:") + # log.info(experiment["status"]["currentOptimalTrial"]) @@ -249,24 +249,24 @@ def collect_run_results(self): experiment = self.get_experiment() while "status" not in experiment: - print("Waitinng for the status") + log.info("Waitinng for the status") sleep(5) experiment = self.get_experiment() - # print(experiment) + # log.info(experiment) while experiment["status"]["conditions"][-1]["type"]!="Succeeded": experiment = self.get_experiment() - print("\nWaiting for the experiment to finish:") + log.info("\nWaiting for the experiment to finish:") if "trialsSucceeded" in experiment["status"]: - print(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded') - print(experiment["status"]["conditions"][-1]) + log.info(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded') + log.info(experiment["status"]["conditions"][-1]) sleep(2) - print("\n Experiment finished with following optimal trial:") - print(experiment["status"]["currentOptimalTrial"]) + log.info("\n Experiment finished with following optimal trial:") + log.info(experiment["status"]["currentOptimalTrial"]) @@ -274,8 +274,8 @@ def test(self): return super().test() def undeploy(self): - print("Undeploying katib:") - # res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"') + log.info("Undeploying katib:") + res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"') @@ -311,8 +311,6 @@ def undeploy(self): log.info("Finished undeploying") - def main(): - print("jeb") From 3940d11896d54d20f0b127dca1a32d4124c59f6e Mon Sep 17 00:00:00 2001 From: witja46 Date: Wed, 17 Aug 2022 13:20:29 +0200 Subject: [PATCH 5/9] Fixes in undeploy --- experiments/katib_minikube/katib_benchmark.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/experiments/katib_minikube/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py index 1009a24..f96ecea 100644 --- a/experiments/katib_minikube/katib_benchmark.py +++ b/experiments/katib_minikube/katib_benchmark.py @@ -275,14 +275,14 @@ def test(self): def undeploy(self): log.info("Undeploying katib:") - res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"') - + res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"').read() + log.info(res) w = watch.Watch() c = client.CoreV1Api() - log.info("Deleteing the namespace:") - res = c.delete_namespace_with_http_info(name=self.namespace) + # log.info("Deleteing the namespace:") + # res = c.delete_namespace_with_http_info(name=self.namespace) log.info("Checking status of the namespace:") @@ -326,10 +326,10 @@ def undeploy(self): "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), "generateNewDockerImage": True }) - bench.deploy() - bench.setup() - bench.run() - bench.collect_run_results() + # bench.deploy() + # bench.setup() + # bench.run() + # bench.collect_run_results() bench.undeploy() From 0c7f6e31284fc527b08f4fa86ea1900ef1e106a4 Mon Sep 17 00:00:00 2001 From: witja46 Date: Tue, 23 Aug 2022 12:47:17 +0200 Subject: [PATCH 6/9] 1. Undeploy works now. 2. Small changes in light_task --- .../katib_minikube/experiment_template.yaml | 2 +- experiments/katib_minikube/grid.yaml | 8 +-- experiments/katib_minikube/katib_benchmark.py | 53 +++++++++++++++---- .../katib_minikube/light_task/Dockerfile | 14 +++++ .../{mnist_task_old => light_task}/README.md | 0 .../requirements.txt | 0 .../{mnist_task_old => light_task}/task.py | 30 ++++++----- .../katib_minikube/mnist_task_old/Dockerfile | 15 ------ 8 files changed, 78 insertions(+), 44 deletions(-) create mode 100644 experiments/katib_minikube/light_task/Dockerfile rename experiments/katib_minikube/{mnist_task_old => light_task}/README.md (100%) rename experiments/katib_minikube/{mnist_task_old => light_task}/requirements.txt (100%) rename experiments/katib_minikube/{mnist_task_old => light_task}/task.py (65%) delete mode 100644 experiments/katib_minikube/mnist_task_old/Dockerfile diff --git a/experiments/katib_minikube/experiment_template.yaml b/experiments/katib_minikube/experiment_template.yaml index e733d32..1e719a9 100644 --- a/experiments/katib_minikube/experiment_template.yaml +++ b/experiments/katib_minikube/experiment_template.yaml @@ -39,7 +39,7 @@ spec: imagePullPolicy: IfNotPresent command: - "python3" - - "experiments/katib_minikube/mnist_task/task.py" + - "experiments/katib_minikube/$worker_image/task.py" - "--batch-size=64" - "--lr=$trialParameters" env: diff --git a/experiments/katib_minikube/grid.yaml b/experiments/katib_minikube/grid.yaml index df91f9a..0dc96de 100644 --- a/experiments/katib_minikube/grid.yaml +++ b/experiments/katib_minikube/grid.yaml @@ -3,7 +3,7 @@ apiVersion: kubeflow.org/v1beta1 kind: Experiment metadata: namespace: kubeflow - name: katib-study-22 + name: katib-study-80 spec: objective: type: maximize @@ -35,16 +35,16 @@ spec: spec: containers: - name: training-container - image: mnist_katib + image: light_task imagePullPolicy: IfNotPresent command: - "python3" - - "experiments/katib_minikube/mnist_task/task.py" + - "experiments/katib_minikube/light_task/task.py" - "--batch-size=64" - "--lr=${trialParameters.learningRate}" env: - name: STUDY_NAME - value: "katib-study-22" + value: "katib-study-80" - name: DB_CONN value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" diff --git a/experiments/katib_minikube/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py index f96ecea..1443ef4 100644 --- a/experiments/katib_minikube/katib_benchmark.py +++ b/experiments/katib_minikube/katib_benchmark.py @@ -159,7 +159,7 @@ def setup(self): self.image_builder = builder_from_string("minikube")() PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../")) res = self.image_builder.deploy_image( - "experiments/katib_minikube/mnist_task/Dockerfile", self.trial_tag,PROJECT_ROOT) + f'experiments/katib_minikube/{self.trial_tag}/Dockerfile', self.trial_tag,PROJECT_ROOT) log.info(res) log.info(f"Image: {self.trial_tag}") @@ -223,7 +223,7 @@ def collect_run_results(self): deployed = 0 #TODO is there a way to run watch on namespaced custome object? - # for e in w.stream(c.get_namespaced_custom_object_with_http_info, + # for e in w.stream(c.get_namespaced_custom_object, # group=self.group, # version=self.version, # namespace=self.namespace, @@ -274,15 +274,36 @@ def test(self): return super().test() def undeploy(self): - log.info("Undeploying katib:") - res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"').read() - log.info(res) + log.info("Deleteing the experiment crd from the cluster") + # res = os.popen('kubectl delete -f grid.yaml').read() + # log.info(res) + config.load_kube_config() + api = client.CustomObjectsApi() + try: + resource = api.delete_collection_namespaced_custom_object( + group=self.group, + version=self.version, + namespace=self.namespace, + plural=self.plural, + ) + log.info(resource) + # log.info(resource) + # log.info(resource["status"]) + except ApiException as e: + log.info("Exception when calling CustomObjectsApi->get_namespaced_custom_object_status: %s\n" % e) + + + # log.info("Undeploying katib:") + # res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"').read() + # log.info(res) + + w = watch.Watch() c = client.CoreV1Api() - # log.info("Deleteing the namespace:") - # res = c.delete_namespace_with_http_info(name=self.namespace) + log.info("Deleteing the namespace:") + res = c.delete_namespace_with_http_info(name=self.namespace) log.info("Checking status of the namespace:") @@ -291,10 +312,15 @@ def undeploy(self): except ApiException as err: log.info(err) log.info("Namespace sucessfully deleted") + + log.info("Deleteing task docker image from minikube") + sleep(1) + self.image_builder.cleanup(self.trial_tag) log.info("Finished undeploying") return + log.info("Namespace still not termintated") #if the namespace was still existent we must wait till it is really terminated for e in w.stream(c.list_namespace): ob = e["object"] @@ -306,6 +332,10 @@ def undeploy(self): except ApiException as err: log.info(err) log.info("Namespace sucessfully deleted") + + log.info("Deleteing task docker image from minikube") + sleep(1) + self.image_builder.cleanup(self.trial_tag) w.stop() break @@ -322,14 +352,15 @@ def undeploy(self): # "dockerUserPassword":"", # "studyName":"" "jobs_Count":10, + "dockerImageTag":"light_task", "workerCount":10, "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), "generateNewDockerImage": True }) - # bench.deploy() - # bench.setup() - # bench.run() - # bench.collect_run_results() + bench.deploy() + bench.setup() + bench.run() + bench.collect_run_results() bench.undeploy() diff --git a/experiments/katib_minikube/light_task/Dockerfile b/experiments/katib_minikube/light_task/Dockerfile new file mode 100644 index 0000000..b2bd6aa --- /dev/null +++ b/experiments/katib_minikube/light_task/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.9-slim + +RUN pip install pip --upgrade + +COPY experiments experiments +# COPY data data +COPY setup.py setup.py +COPY ml_benchmark ml_benchmark + + + +RUN pip install -e . + +CMD ["python", "experiments/katib_minikube/task_light/task.py"] \ No newline at end of file diff --git a/experiments/katib_minikube/mnist_task_old/README.md b/experiments/katib_minikube/light_task/README.md similarity index 100% rename from experiments/katib_minikube/mnist_task_old/README.md rename to experiments/katib_minikube/light_task/README.md diff --git a/experiments/katib_minikube/mnist_task_old/requirements.txt b/experiments/katib_minikube/light_task/requirements.txt similarity index 100% rename from experiments/katib_minikube/mnist_task_old/requirements.txt rename to experiments/katib_minikube/light_task/requirements.txt diff --git a/experiments/katib_minikube/mnist_task_old/task.py b/experiments/katib_minikube/light_task/task.py similarity index 65% rename from experiments/katib_minikube/mnist_task_old/task.py rename to experiments/katib_minikube/light_task/task.py index a8dcc0c..1101b71 100644 --- a/experiments/katib_minikube/mnist_task_old/task.py +++ b/experiments/katib_minikube/light_task/task.py @@ -1,26 +1,30 @@ from __future__ import print_function import argparse +import imp import logging import os import numpy as np import time +from ml_benchmark.latency_tracker import latency_decorator -def train(times ,epoch): +@latency_decorator +def train(times ,epochs): loss = 1 - for x in np.arange(1,times + 1): - loss = 1 - (x -1 ) / times - time.sleep(0.01) - msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format( - epoch, x , times, - 100. * x / times, loss) - logging.info(msg) + for epoch in range(epochs): + for x in np.arange(1,times + 1): + loss = 1 - (x -1 ) / times + time.sleep(0.01) + msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format( + epoch, x , times, + 100. * x / times, loss) + logging.info(msg) def test(): - test_accuracy =0.8 - logging.info("Validation-accuracy={:.4f}\n".format( + test_accuracy =0.7 + logging.info("precision={:.4f}\n".format( test_accuracy)) @@ -48,9 +52,9 @@ def main(): epochs = args.epochs batch_size = args.batch_size lr = args.lr - for epoch in np.arange(1,epochs+1): - train(batch_size,epoch) - test() + + train(batch_size,epochs) + test() diff --git a/experiments/katib_minikube/mnist_task_old/Dockerfile b/experiments/katib_minikube/mnist_task_old/Dockerfile deleted file mode 100644 index 1cbfd5b..0000000 --- a/experiments/katib_minikube/mnist_task_old/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM python:3.9-slim - -ADD . /opt/mnist_task -WORKDIR /opt/mnist_task - -# Add folder for the logs. -RUN mkdir /katib -RUN pip install --no-cache-dir -r requirements.txt - -RUN chgrp -R 0 /opt/mnist_task \ - && chmod -R g+rwX /opt/mnist_task \ - && chgrp -R 0 /katib \ - && chmod -R g+rwX /katib - -ENTRYPOINT ["python3", "/opt/mnist_task/task.py"] From 2e38ed21b1be93ba1402d914bac21e1d7ef6093f Mon Sep 17 00:00:00 2001 From: witja46 Date: Wed, 24 Aug 2022 13:33:37 +0200 Subject: [PATCH 7/9] fixes in deploy --- experiments/katib_minikube/grid.yaml | 4 +- experiments/katib_minikube/katib_benchmark.py | 115 ++++++++---------- 2 files changed, 54 insertions(+), 65 deletions(-) diff --git a/experiments/katib_minikube/grid.yaml b/experiments/katib_minikube/grid.yaml index 0dc96de..d873f53 100644 --- a/experiments/katib_minikube/grid.yaml +++ b/experiments/katib_minikube/grid.yaml @@ -3,7 +3,7 @@ apiVersion: kubeflow.org/v1beta1 kind: Experiment metadata: namespace: kubeflow - name: katib-study-80 + name: katib-study-48 spec: objective: type: maximize @@ -44,7 +44,7 @@ spec: - "--lr=${trialParameters.learningRate}" env: - name: STUDY_NAME - value: "katib-study-80" + value: "katib-study-48" - name: DB_CONN value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" diff --git a/experiments/katib_minikube/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py index 1443ef4..c265eb3 100644 --- a/experiments/katib_minikube/katib_benchmark.py +++ b/experiments/katib_minikube/katib_benchmark.py @@ -34,6 +34,7 @@ def __init__(self, objective, grid, resources) -> None: config.load_kube_config() self.logging_level= self.resources.get("loggingLevel",log.INFO) + self.clean_up = self.resources.get("cleanUp",True) log.basicConfig(format='%(asctime)s Katib Benchmark %(levelname)s: %(message)s',level=self.logging_level) @@ -79,6 +80,8 @@ def deploy(self): on a platform, e.g,. in the case of Kubernetes it referes to the steps nassary to deploy all pods and services in kubernetes. """ + + log.info("Deploying katib:") res = os.popen('kubectl apply -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"').read() log.info(res) @@ -106,17 +109,23 @@ def deploy(self): log.info(f'{ob.metadata.name} is ready') monitored_pods.remove(name) deployed = deployed + 1 + log.info(monitored_pods) - #if all monitored pods are running the deployment process was ended - if(deployed == 4 ): - w.stop() - log.info("Finished deploying crucial pods") + # Checking for status of cert generator + elif name == "katib-cert-generator" and ob.status.phase == "Succeeded": + log.info(f'{ob.metadata.name} is Succeeded') + monitored_pods.remove(name) + log.info(monitored_pods) + #if all monitored pods are running the deployment process was ended + if not monitored_pods: + w.stop() + log.info("Finished deploying crucial pods") - #TODO waiit untill all pods are running + @@ -188,7 +197,28 @@ def run(self): except ApiException as e: log.info("Exception when calling CustomObjectsApi->create_cluster_custom_object: %s\n" % e) - + + #Blocking untill the run is finished + #The GET /apis/{group}/{version}/namespaces/{namespace}/{plural}/{name} endpoint doesnt support watch argument. + + experiment = self.get_experiment() + while "status" not in experiment: + log.info("Waitinng for the status") + sleep(5) + experiment = self.get_experiment() + # log.info(experiment) + + + while experiment["status"]["conditions"][-1]["type"]!="Succeeded": + experiment = self.get_experiment() + succeeded = experiment["status"].get("trialsSucceeded",0) + + log.info(f'Status: {experiment["status"]["conditions"][-1]["reason"]} {succeeded} trials of {self.jobsCount} succeeded') + log.debug(experiment["status"]) + + sleep(2) + + def collect_benchmark_metrics(self): return super().collect_benchmark_metrics() @@ -209,65 +239,21 @@ def get_experiment(self): # log.info(resource) # log.info(resource["status"]) + return resource except ApiException as e: log.info("Exception when calling CustomObjectsApi->get_namespaced_custom_object_status: %s\n" % e) - return resource + return "" def collect_run_results(self): log.info("Collecting run results:") - config.load_kube_config() - w = watch.Watch() - c = client.CustomObjectsApi() - deployed = 0 - - #TODO is there a way to run watch on namespaced custome object? - # for e in w.stream(c.get_namespaced_custom_object, - # group=self.group, - # version=self.version, - # namespace=self.namespace, - # name=self.study_name, - # plural=self.plural): - # experiment = e["object"] - # if "status" not in experiment: - # log.info("Waitinng for the status") - # sleep(5) - # elif experiment["status"]["conditions"][-1]["type"]!="Succeeded": - # log.info("chuja") - # if "trialsSucceeded" in experiment["status"]: - # log.info(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded') - # log.info(experiment["status"]["conditions"][-1]) - # sleep(2) - # else: - # log.info("\n Experiment finished with following optimal trial:") - # log.info(experiment["status"]["currentOptimalTrial"]) - - - - - - experiment = self.get_experiment() - while "status" not in experiment: - log.info("Waitinng for the status") - sleep(5) - experiment = self.get_experiment() - # log.info(experiment) - - - while experiment["status"]["conditions"][-1]["type"]!="Succeeded": - experiment = self.get_experiment() - log.info("\nWaiting for the experiment to finish:") - if "trialsSucceeded" in experiment["status"]: - log.info(f'{experiment["status"]["trialsSucceeded"]} trials of {self.jobsCount} succeeded') - log.info(experiment["status"]["conditions"][-1]) - sleep(2) - + experiment = self.get_experiment() - log.info("\n Experiment finished with following optimal trial:") - log.info(experiment["status"]["currentOptimalTrial"]) - + log.debug("\n Experiment finished with following optimal trial:") + log.debug(experiment["status"]["currentOptimalTrial"]) + return experiment["status"]["currentOptimalTrial"] def test(self): @@ -313,9 +299,10 @@ def undeploy(self): log.info(err) log.info("Namespace sucessfully deleted") - log.info("Deleteing task docker image from minikube") - sleep(1) - self.image_builder.cleanup(self.trial_tag) + if self.clean_up: + log.info("Deleteing task docker image from minikube") + sleep(2) + self.image_builder.cleanup(self.trial_tag) log.info("Finished undeploying") return @@ -333,9 +320,10 @@ def undeploy(self): log.info(err) log.info("Namespace sucessfully deleted") - log.info("Deleteing task docker image from minikube") - sleep(1) - self.image_builder.cleanup(self.trial_tag) + if self.clean_up: + log.info("Deleteing task docker image from minikube") + sleep(2) + self.image_builder.cleanup(self.trial_tag) w.stop() break @@ -355,7 +343,8 @@ def undeploy(self): "dockerImageTag":"light_task", "workerCount":10, "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), - "generateNewDockerImage": True + "generateNewDockerImage": True, + "cleanUp": True, }) bench.deploy() bench.setup() From a494587c07b13cd46b798d5bf8605af234cc7d15 Mon Sep 17 00:00:00 2001 From: witja46 Date: Tue, 30 Aug 2022 17:35:03 +0200 Subject: [PATCH 8/9] 1. Clean up in katib benchmark. 2. fixed light_task. 3.Latency tracker prints out the exception witch acurres by latency recording --- .gitignore | 1 + experiments/katib_minikube/grid.yaml | 10 +- experiments/katib_minikube/katib_benchmark.py | 143 ++++++------------ .../katib_minikube/light_task/Dockerfile | 2 +- ml_benchmark/latency_tracker.py | 5 +- 5 files changed, 59 insertions(+), 102 deletions(-) diff --git a/.gitignore b/.gitignore index 89eb912..07d8d98 100644 --- a/.gitignore +++ b/.gitignore @@ -131,5 +131,6 @@ dmypy.json exp__* experiments/simple_raytune/benchmark__RaytuneBenchmark experiments/optuna_minikube/benchmark__OptunaMinikubeBenchmark +experiments/katib_minikube/benchmark__KatibBenchmark data/ \ No newline at end of file diff --git a/experiments/katib_minikube/grid.yaml b/experiments/katib_minikube/grid.yaml index d873f53..b9ae13c 100644 --- a/experiments/katib_minikube/grid.yaml +++ b/experiments/katib_minikube/grid.yaml @@ -3,7 +3,7 @@ apiVersion: kubeflow.org/v1beta1 kind: Experiment metadata: namespace: kubeflow - name: katib-study-48 + name: katib-study-50 spec: objective: type: maximize @@ -11,9 +11,9 @@ spec: objectiveMetricName: precision algorithm: algorithmName: grid - parallelTrialCount: 10 - maxTrialCount: 10 - maxFailedTrialCount: 10 + parallelTrialCount: 5 + maxTrialCount: 5 + maxFailedTrialCount: 5 parameters: - name: lr parameterType: double @@ -44,7 +44,7 @@ spec: - "--lr=${trialParameters.learningRate}" env: - name: STUDY_NAME - value: "katib-study-48" + value: "katib-study-50" - name: DB_CONN value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" diff --git a/experiments/katib_minikube/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py index c265eb3..b821e7f 100644 --- a/experiments/katib_minikube/katib_benchmark.py +++ b/experiments/katib_minikube/katib_benchmark.py @@ -1,5 +1,6 @@ from os import path import os +from shutil import ExecError import sys from time import sleep from urllib.request import urlopen @@ -20,60 +21,30 @@ class KatibBenchmark(Benchmark): - def __init__(self, objective, grid, resources) -> None: - self.objective = objective - self.grid = grid + def __init__(self, resources) -> None: self.resources = resources self.group="kubeflow.org" self.version="v1beta1" - self.namespace='kubeflow' + self.namespace="kubeflow" self.plural="experiments" self.experiment_file_name = "grid.yaml" self.metrics_ip = resources.get("metricsIP") self.generate_new_docker_image = resources.get("generateNewDockerImage",True) - config.load_kube_config() - - self.logging_level= self.resources.get("loggingLevel",log.INFO) self.clean_up = self.resources.get("cleanUp",True) + self.trial_tag = resources.get("dockerImageTag","mnist_katib") + self.study_name= resources.get("studyName",f'katib-study-{random.randint(0, 100)}') + self.workerCpu = resources.get("workerCpu",2) + self.workerMemory= resources.get("workerMemory",2) + self.workerCount = resources.get("workerCount",5) + self.jobsCount = resources.get("jobsCount",6) + self.logging_level= self.resources.get("loggingLevel",log.INFO) log.basicConfig(format='%(asctime)s Katib Benchmark %(levelname)s: %(message)s',level=self.logging_level) - - if "dockerImageTag" in self.resources: - self.trial_tag = self.resources["dockerImageTag"] - else: - self.trial_tag = "mnist_katib" - - - - - - if "studyName" in self.resources: - self.study_name = self.resources["studyName"] - else: - self.study_name = f"katib-study-{random.randint(0, 100)}" - - if "workerCpu" in self.resources: - self.workerCpu = self.resources["workerCpu"] - else: - self.workerCpu = 2 - - if "workerMemory" in resources: - self.workerMemory = self.resources["workerMemory"] - else: - self.workerMemory = 2 - if "workerCount" in resources: - self.workerCount = self.resources["workerCount"] - else: - self.workerCount = 5 - - if "jobs_Count" in resources: - self.jobsCount = self.resources["jobs_Count"] - else: - self.jobsCount = 6 + def deploy(self): """ With the completion of this step the desired architecture of the HPO Framework should be running @@ -109,13 +80,11 @@ def deploy(self): log.info(f'{ob.metadata.name} is ready') monitored_pods.remove(name) deployed = deployed + 1 - log.info(monitored_pods) # Checking for status of cert generator elif name == "katib-cert-generator" and ob.status.phase == "Succeeded": log.info(f'{ob.metadata.name} is Succeeded') monitored_pods.remove(name) - log.info(monitored_pods) #if all monitored pods are running the deployment process was ended @@ -200,7 +169,8 @@ def run(self): #Blocking untill the run is finished #The GET /apis/{group}/{version}/namespaces/{namespace}/{plural}/{name} endpoint doesnt support watch argument. - + + #TODO changing to watching of the jobs instead of the experiment crd? experiment = self.get_experiment() while "status" not in experiment: log.info("Waitinng for the status") @@ -237,8 +207,6 @@ def get_experiment(self): plural=self.plural, ) - # log.info(resource) - # log.info(resource["status"]) return resource except ApiException as e: log.info("Exception when calling CustomObjectsApi->get_namespaced_custom_object_status: %s\n" % e) @@ -260,12 +228,12 @@ def test(self): return super().test() def undeploy(self): + log.info("Deleteing the experiment crd from the cluster") - # res = os.popen('kubectl delete -f grid.yaml').read() - # log.info(res) config.load_kube_config() api = client.CustomObjectsApi() try: + #deleting all experiment crds resource = api.delete_collection_namespaced_custom_object( group=self.group, version=self.version, @@ -273,59 +241,40 @@ def undeploy(self): plural=self.plural, ) log.info(resource) - # log.info(resource) - # log.info(resource["status"]) + except ApiException as e: log.info("Exception when calling CustomObjectsApi->get_namespaced_custom_object_status: %s\n" % e) - - - # log.info("Undeploying katib:") - # res = os.popen('kubectl delete -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"').read() - # log.info(res) - w = watch.Watch() c = client.CoreV1Api() log.info("Deleteing the namespace:") res = c.delete_namespace_with_http_info(name=self.namespace) - + log.info("Checking status of the namespace:") - try: - log.debug(c.read_namespace_status_with_http_info(name=self.namespace)) - except ApiException as err: - log.info(err) - log.info("Namespace sucessfully deleted") - - if self.clean_up: - log.info("Deleteing task docker image from minikube") - sleep(2) - self.image_builder.cleanup(self.trial_tag) - log.info("Finished undeploying") - return - - - log.info("Namespace still not termintated") #if the namespace was still existent we must wait till it is really terminated for e in w.stream(c.list_namespace): ob = e["object"] # if the status of our namespace was changed we check if it the namespace was really removed from the cluster by requesting and expecting it to be not found - #TODO do this in other way if ob.metadata.name == self.namespace: try: log.debug(c.read_namespace_status_with_http_info(name=self.namespace)) except ApiException as err: - log.info(err) - log.info("Namespace sucessfully deleted") if self.clean_up: log.info("Deleteing task docker image from minikube") sleep(2) self.image_builder.cleanup(self.trial_tag) - w.stop() - break + + + if(err.status != 404): + raise Exception("Something went wrong",err) + else: + log.info("Namespace sucessfully deleted") + w.stop() + log.info("Finished undeploying") @@ -335,22 +284,28 @@ def undeploy(self): if __name__ == "__main__": #main() - bench = KatibBenchmark(1,1,resources={ - # "dockerUserLogin":"", - # "dockerUserPassword":"", - # "studyName":"" - "jobs_Count":10, - "dockerImageTag":"light_task", - "workerCount":10, - "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), - "generateNewDockerImage": True, - "cleanUp": True, - }) - bench.deploy() - bench.setup() - bench.run() - bench.collect_run_results() - bench.undeploy() - - + resources={ + # "dockerUserLogin":"", + # "dockerUserPassword":"", + # "studyName":"" + "jobsCount":5, + "dockerImageTag":"light_task", + "workerCount":5, + "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), + "generateNewDockerImage": True, + "cleanUp": True , + } + # bench = KatibBenchmark(resources=resources) + # bench.deploy() + # bench.setup() + # bench.run() + # bench.collect_run_results() + # bench.undeploy() + + + + from ml_benchmark.benchmark_runner import BenchmarkRunner + runner = BenchmarkRunner( + benchmark_cls=KatibBenchmark, resources=resources) + runner.run() \ No newline at end of file diff --git a/experiments/katib_minikube/light_task/Dockerfile b/experiments/katib_minikube/light_task/Dockerfile index b2bd6aa..8a05a7d 100644 --- a/experiments/katib_minikube/light_task/Dockerfile +++ b/experiments/katib_minikube/light_task/Dockerfile @@ -11,4 +11,4 @@ COPY ml_benchmark ml_benchmark RUN pip install -e . -CMD ["python", "experiments/katib_minikube/task_light/task.py"] \ No newline at end of file +CMD ["python", "experiments/katib_minikube/light_task/task.py"] \ No newline at end of file diff --git a/ml_benchmark/latency_tracker.py b/ml_benchmark/latency_tracker.py index 0f98094..f5929f8 100644 --- a/ml_benchmark/latency_tracker.py +++ b/ml_benchmark/latency_tracker.py @@ -61,8 +61,9 @@ def track(self, latency_obj): stmt = insert(latency).values(latency_obj.to_dict()) conn.execute(stmt) print(f"Latency Recorded! Latency: {latency_obj.to_dict()}") - except Exception: - print(f"Failed to record latency: {latency_obj.to_dict()}") + except Exception as e: + print(f"Failed to record latency: {latency_obj.to_dict()}",e) + def _get_connection_string(self): # XXX: list order is implicitly a priority From 45654579ecb5f3dad9d5c82a4ebe14066bebe5df Mon Sep 17 00:00:00 2001 From: witja46 Date: Wed, 31 Aug 2022 13:47:28 +0200 Subject: [PATCH 9/9] 1.Changed the mnist_task base image to lighter one. 2.Katib_benchmark runs mnist_task per default. --- experiments/katib_minikube/grid.yaml | 14 +++++++------- experiments/katib_minikube/katib_benchmark.py | 9 ++++----- experiments/katib_minikube/mnist_task/Dockerfile | 9 ++++----- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/experiments/katib_minikube/grid.yaml b/experiments/katib_minikube/grid.yaml index b9ae13c..04c4f67 100644 --- a/experiments/katib_minikube/grid.yaml +++ b/experiments/katib_minikube/grid.yaml @@ -3,7 +3,7 @@ apiVersion: kubeflow.org/v1beta1 kind: Experiment metadata: namespace: kubeflow - name: katib-study-50 + name: katib-study-32 spec: objective: type: maximize @@ -11,9 +11,9 @@ spec: objectiveMetricName: precision algorithm: algorithmName: grid - parallelTrialCount: 5 - maxTrialCount: 5 - maxFailedTrialCount: 5 + parallelTrialCount: 2 + maxTrialCount: 2 + maxFailedTrialCount: 2 parameters: - name: lr parameterType: double @@ -35,16 +35,16 @@ spec: spec: containers: - name: training-container - image: light_task + image: mnist_task imagePullPolicy: IfNotPresent command: - "python3" - - "experiments/katib_minikube/light_task/task.py" + - "experiments/katib_minikube/mnist_task/task.py" - "--batch-size=64" - "--lr=${trialParameters.learningRate}" env: - name: STUDY_NAME - value: "katib-study-50" + value: "katib-study-32" - name: DB_CONN value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" diff --git a/experiments/katib_minikube/katib_benchmark.py b/experiments/katib_minikube/katib_benchmark.py index b821e7f..d4016b3 100644 --- a/experiments/katib_minikube/katib_benchmark.py +++ b/experiments/katib_minikube/katib_benchmark.py @@ -31,7 +31,7 @@ def __init__(self, resources) -> None: self.metrics_ip = resources.get("metricsIP") self.generate_new_docker_image = resources.get("generateNewDockerImage",True) self.clean_up = self.resources.get("cleanUp",True) - self.trial_tag = resources.get("dockerImageTag","mnist_katib") + self.trial_tag = resources.get("dockerImageTag","mnist_task") self.study_name= resources.get("studyName",f'katib-study-{random.randint(0, 100)}') self.workerCpu = resources.get("workerCpu",2) self.workerMemory= resources.get("workerMemory",2) @@ -66,7 +66,6 @@ def deploy(self): log.info("Waiting for all Katib pods to be ready:") # From all pods that polyaxon starts we are onlly really intrested for following 4 that are crucial for runnig of the experiments monitored_pods = ["katib-cert-generator","katib-db-manager","katib-mysql","katib-ui","katib-controller"] - # TODO changing to list_namespaced_deployments? for e in w.stream(c.list_namespaced_pod, namespace=self.namespace): ob = e["object"] @@ -289,9 +288,9 @@ def undeploy(self): # "dockerUserLogin":"", # "dockerUserPassword":"", # "studyName":"" - "jobsCount":5, - "dockerImageTag":"light_task", - "workerCount":5, + "jobsCount":2, + # "dockerImageTag":"light_task", + "workerCount":2, "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), "generateNewDockerImage": True, "cleanUp": True , diff --git a/experiments/katib_minikube/mnist_task/Dockerfile b/experiments/katib_minikube/mnist_task/Dockerfile index 7e62b3b..0415797 100644 --- a/experiments/katib_minikube/mnist_task/Dockerfile +++ b/experiments/katib_minikube/mnist_task/Dockerfile @@ -1,11 +1,10 @@ -#TODO change to some slimer base image -FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime -#FROM python:3.9-slim +FROM python:3.9.6-slim RUN pip install pip --upgrade +RUN pip install torch==1.10.2 torchvision==0.11.3 +RUN pip install optuna==2.10.1 -COPY experiments experiments -# COPY data data +COPY experiments/katib_minikube experiments/katib_minikube COPY setup.py setup.py COPY ml_benchmark ml_benchmark