From 4cf5c927ade51b72178379af5ba1664ff7ed89ed Mon Sep 17 00:00:00 2001 From: witja46 Date: Tue, 2 Aug 2022 14:23:13 +0200 Subject: [PATCH 1/9] Redone standalone polyaxon script --- .../experiment_template.yaml | 21 ++ experiments/polyaxon_minikube/grid.yaml | 21 ++ .../polyaxon_minikube/mnist_task/Dockerfile | 15 + .../polyaxon_minikube/mnist_task/README.md | 11 + .../mnist_task/requirements.txt | 3 + .../polyaxon_minikube/mnist_task/task.py | 66 ++++ .../polyaxon_minikube/polyaxon_benchmark.py | 334 ++++++++++++++++++ 7 files changed, 471 insertions(+) create mode 100644 experiments/polyaxon_minikube/experiment_template.yaml create mode 100644 experiments/polyaxon_minikube/grid.yaml create mode 100644 experiments/polyaxon_minikube/mnist_task/Dockerfile create mode 100644 experiments/polyaxon_minikube/mnist_task/README.md create mode 100644 experiments/polyaxon_minikube/mnist_task/requirements.txt create mode 100644 experiments/polyaxon_minikube/mnist_task/task.py create mode 100644 experiments/polyaxon_minikube/polyaxon_benchmark.py diff --git a/experiments/polyaxon_minikube/experiment_template.yaml b/experiments/polyaxon_minikube/experiment_template.yaml new file mode 100644 index 0000000..507a2a5 --- /dev/null +++ b/experiments/polyaxon_minikube/experiment_template.yaml @@ -0,0 +1,21 @@ +version: 1.1 +kind: operation +name: $study_name +matrix: + kind: grid + numRuns: $jobs_num + concurrency: $worker_num + params: + lr: + kind: logspace + value: 0.01:0.1:$jobs_num +component: + inputs: + - name: lr + type: float + run: + kind: job + container: + image: $worker_image + command: [python3, /opt/mnist_task/task.py] + args: ["--batch-size=64", "--lr={{ lr }}"] diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml new file mode 100644 index 0000000..a0e9625 --- /dev/null +++ b/experiments/polyaxon_minikube/grid.yaml @@ -0,0 +1,21 @@ +version: 1.1 +kind: operation +name: polyaxon-study-72 +matrix: + kind: grid + numRuns: 5 + concurrency: 5 + params: + lr: + kind: logspace + value: 0.01:0.1:5 +component: + inputs: + - name: lr + type: float + run: + kind: job + container: + image: witja46/mnist_polyaxon:latest + command: [python3, /opt/mnist_task/task.py] + args: ["--batch-size=64", "--lr={{ lr }}"] diff --git a/experiments/polyaxon_minikube/mnist_task/Dockerfile b/experiments/polyaxon_minikube/mnist_task/Dockerfile new file mode 100644 index 0000000..1cbfd5b --- /dev/null +++ b/experiments/polyaxon_minikube/mnist_task/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.9-slim + +ADD . /opt/mnist_task +WORKDIR /opt/mnist_task + +# Add folder for the logs. +RUN mkdir /katib +RUN pip install --no-cache-dir -r requirements.txt + +RUN chgrp -R 0 /opt/mnist_task \ + && chmod -R g+rwX /opt/mnist_task \ + && chgrp -R 0 /katib \ + && chmod -R g+rwX /katib + +ENTRYPOINT ["python3", "/opt/mnist_task/task.py"] diff --git a/experiments/polyaxon_minikube/mnist_task/README.md b/experiments/polyaxon_minikube/mnist_task/README.md new file mode 100644 index 0000000..3326bc7 --- /dev/null +++ b/experiments/polyaxon_minikube/mnist_task/README.md @@ -0,0 +1,11 @@ +# PyTorch MNIST Image Classification Example + +This is PyTorch MNIST image classification training container with saving metrics +to the file or printing to the StdOut. It uses convolutional neural network to +train the model. + +Katib uses this training container in some Experiments, for instance in the +[file Metrics Collector example](../../metrics-collector/file-metrics-collector.yaml#L55-L64), +the [file Metrics Collector with logs in JSON format example](../../metrics-collector/file-metrics-collector-with-json-format.yaml#L52-L62), +the [median stopping early stopping rule with logs in JSON format example](../../early-stopping/median-stop-with-json-format.yaml#L62-L71) +and the [PyTorchJob example](../../kubeflow-training-operator/pytorchjob-mnist.yaml#L47-L54). diff --git a/experiments/polyaxon_minikube/mnist_task/requirements.txt b/experiments/polyaxon_minikube/mnist_task/requirements.txt new file mode 100644 index 0000000..95b0e5e --- /dev/null +++ b/experiments/polyaxon_minikube/mnist_task/requirements.txt @@ -0,0 +1,3 @@ +Pillow>=9.1.1 +numpy +polyaxon \ No newline at end of file diff --git a/experiments/polyaxon_minikube/mnist_task/task.py b/experiments/polyaxon_minikube/mnist_task/task.py new file mode 100644 index 0000000..d18bae3 --- /dev/null +++ b/experiments/polyaxon_minikube/mnist_task/task.py @@ -0,0 +1,66 @@ +from __future__ import print_function + +from polyaxon import tracking +import argparse +import logging +import os +import numpy as np +import time + +def train(times ,epoch): + loss = 1 + for x in np.arange(1,times + 1): + loss = 1 - (x -1 ) / times + time.sleep(0.01) + msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format( + epoch, x , times, + 100. * x / times, loss) + logging.info(msg) + + +def test(): + + test_accuracy =0.8 + logging.info("Validation-accuracy={:.4f}\n".format( + test_accuracy)) + + + +def main(): + + #Polyaxon + tracking.init() + + #parsing arguments + parser = argparse.ArgumentParser(description="MNIST Example") + parser.add_argument("--batch-size", type=int, default=64, metavar="N", + help="input batch size for training (default: 64)") + parser.add_argument("--epochs", type=int, default=10, metavar="N", + help="number of epochs to train (default: 10)") + parser.add_argument("--lr", type=float, default=0.01, metavar="LR", + help="learning rate (default: 0.01)") + args = parser.parse_args() + + #timestamps format + logging.basicConfig( + format="%(asctime)s %(levelname)-8s %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=logging.DEBUG) + + + #model taining and testing + epochs = args.epochs + batch_size = args.batch_size + lr = args.lr + for epoch in np.arange(1,epochs+1): + train(batch_size,epoch) + test() + + + #polyaxon + tracking.log_metrics(met1=0.8, met2= 0.9) + tracking.log_outputs(loss=0.9,accuracy=0.6) + + +if __name__ == "__main__": + main() diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py new file mode 100644 index 0000000..0295e21 --- /dev/null +++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py @@ -0,0 +1,334 @@ + +from asyncio import subprocess +from concurrent.futures import process +from itertools import count +import json +from os import path +import os +import sys +from time import sleep +import random +from kubernetes import client, config,watch +from kubernetes.client.rest import ApiException +from string import Template +import yaml +import docker +PROJECT_ROOT = path.abspath(path.join(__file__ ,"../../..")) +sys.path.append(PROJECT_ROOT) +from ml_benchmark.benchmark_runner import Benchmark +import requests +import subprocess +import psutil +import logging as log + + + +class PolyaxonBenchmark(Benchmark): + + def __init__(self, objective, grid, resources) -> None: + self.objective = objective + self.grid = grid + self.resources = resources + self.group="kubeflow.org" + self.version="v1beta1" + self.namespace='polyaxon' + self.plural="experiments" + self.experiment_file_name = "grid.yaml" + self.project_description = "Somer random description" + self.polyaxon_addr="http://localhost:31833/" + + config.load_kube_config() + + + log.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',level=log.INFO) + + + if "dockerImageTag" in self.resources: + self.trial_tag = self.resources["dockerImageTag"] + else: + self.trial_tag = "mnist_polyaxon:latest" + + if "dockerUserLogin" in self.resources: + self.docker_user = self.resources["dockerUserLogin"] + self.trial_tag = f'{self.docker_user}/{self.trial_tag}' + else: + self.docker_user = "" + self.trial_tag = "witja46/mnist_polyaxon:latest" + + if "dockerUserPassword" in self.resources: + self.docker_pasword = self.resources["dockerUserPassword"] + else: + self.docker_pasword = "" + + + if "studyName" in self.resources: + self.study_name = self.resources["studyName"] + else: + self.study_name = f"polyaxon-study-{random.randint(0, 100)}" + + if "workerCpu" in self.resources: + self.workerCpu = self.resources["workerCpu"] + else: + self.workerCpu = 2 + + if "workerMemory" in resources: + self.workerMemory = self.resources["workerMemory"] + else: + self.workerMemory = 2 + + if "workerCount" in resources: + self.workerCount = self.resources["workerCount"] + else: + self.workerCount = 5 + + if "jobsCount" in resources: + self.jobsCount = self.resources["jobsCount"] + else: + self.jobsCount = 6 + + def deploy(self): + """ + With the completion of this step the desired architecture of the HPO Framework should be running + on a platform, e.g,. in the case of Kubernetes it referes to the steps nassary to deploy all pods + and services in kubernetes. + """ + + log.info("Adding polyaxon to helm repo:") + res = os.popen('helm repo add polyaxon https://charts.polyaxon.com').read() + log.info(res) + + #TODO deploy via helm for better error handling and easier configuration? + log.info("Deploying polyaxon to minikube:") + res = os.popen('polyaxon admin deploy -t minikube').read() + log.info(res) + + + + + + config.load_kube_config() + w = watch.Watch() + c = client.CoreV1Api() + # c = client.AppsV1Api() + deployed = 0 + + log.info("Waiting for polyaxon pods to be read:") + monitored_pods = ["polyaxon-polyaxon-streams","polyaxon-polyaxon-operator","polyaxon-polyaxon-gateway","polyaxon-polyaxon-api"] + # TODO changing to list_namespaced_deployments? + for e in w.stream(c.list_namespaced_pod, namespace="polyaxon"): + ob = e["object"] + + for name in monitored_pods: + if name in ob.metadata.name: + # log.info(ob.metadata.name) + #log.info(ob) + + if ob.status.phase == "Running" and ob.status.container_statuses[0].ready: + # if ob.status.ready_replicas is not None and ob.status.ready_replicas >= 1: + log.info(f'{ob.metadata.name} is ready') + monitored_pods.remove(name) + deployed = deployed + 1 + # log.info(ob) + if(deployed == 4 ): + w.stop() + log.info("Finished deploying") + + res = os.popen('kubectl get deployment -n polyaxon').read() + log.info(res) + + + + + + # Starting post forwarding to the polyaxon api in the background + log.info("Starting post-forward to polyaxon api:") + self.post_forward_process = subprocess.Popen("kubectl port-forward svc/polyaxon-polyaxon-api 31833:80 -n polyaxon",shell=True,stdout=subprocess.PIPE) + + + + + + + def setup(self): + """ + Every Operation that is needed before the actual optimization (trial) starts and that is not relevant + for starting up workers or the necessary architecture. + """ + + #creating experiment yaml + + experiment_definition = { + "worker_num": self.workerCount, + "jobs_num":self.jobsCount, + "worker_cpu": self.workerCpu, + "worker_mem": f"{self.workerMemory}Gi", + "worker_image": self.trial_tag, + "study_name": self.study_name, + "trialParameters":"${trialParameters.learningRate}" + } + + #loading and filling the template + with open(path.join(path.dirname(__file__), "experiment_template.yaml"), "r") as f: + job_template = Template(f.read()) + job_yml_objects = job_template.substitute(experiment_definition) + + #writing the experiment definition into the file + with open(path.join(path.dirname(__file__), self.experiment_file_name), "w") as f: + f.write(job_yml_objects) + log.info("Experiment yaml created") + + + # Creating new docker image if credentials were passed + if "dockerUserLogin" in self.resources: + + #creating task docker image + log.info("Creating task docker image") + self.client = docker.client.from_env() + image, logs = self.client.images.build(path="./mnist_task",tag=self.trial_tag) + log.info(f"Image: {self.trial_tag}") + for line in logs : + log.info(line) + + #pushing to repo + self.client.login(username=self.docker_user, password=self.docker_pasword) + for line in self.client.images.push(self.trial_tag, stream=True, decode=True): + log.info(line) + + + + def run(self): + + #TODO add error handling + log.info("Creating new project:") + project = requests.post(f'{self.polyaxon_addr}/api/v1/default/projects/create', json={"name": self.study_name, "description": self.project_description}) + log.info(project.text) + + + #TODO find out where to pass the --eager flag so that the run can be created with help of api + # with open(path.join(path.dirname(__file__), self.experiment_file_name), "r") as f: + # body = f.read() + # log.info(body) + # run = requests.post(f'{self.polyaxon_addr}/api/v1/default/{self.study_name}/runs',json={"content":body,"eager":True ,"tags":["--eager"],"meta_info":{"eager":True,"--eager":True,"flags":"--eager"}}) + # log.info(run.text) + + + log.info("Starting polyaxon experiment:") + res = os.popen(f'polyaxon run -f ./{self.experiment_file_name} --project {self.study_name} --eager').read() + log.info(res) + + + #TODO change to kubernetes api for monitoring runing trials + log.info("Waiting for the run to finish:") + finished = False + while not finished: + runs = self.get_succeeded_runs() + log.info(f'{runs["count"]} jobs out of {self.jobsCount} succeded') + + #checking if all runs were finished + finished = runs["count"] == self.jobsCount + sleep(1) + return + + + def collect_benchmark_metrics(self): + pass + + + + def get_succeeded_runs(self, sort_by="duration"): + + #TODO add error handling acording to polyaxon api + res = requests.get(f'{self.polyaxon_addr}/api/v1/default/{self.study_name}/runs?query=status:succeeded&sort={sort_by}') + result = json.loads(res.text) + return result + + def collect_run_results(self): + + + log.info("Collecting run results:") + result = self.get_succeeded_runs() + log.info(json.dumps(result,indent=4)) + + # log.info("\n Experiment finished with following optimal trial:") + # log.info(result["results"][0]) + return result["results"] + + def test(self): + return super().test() + + def undeploy(self): + + log.info("Terminating post forwarding process:") + process = psutil.Process(self.post_forward_process.pid) + for proc in process.children(recursive=True): + proc.kill() + process.kill() + + + + #TODO changing to undeploying with helm? + log.info("Undeploying polyaxon:") + p = subprocess.Popen(["polyaxon", "admin" ,"teardown"],shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE) + out,err = p.communicate(input=b"y") + p.terminate() + + + + + # Waiting untill all polyaxon pods get terminated + config.load_kube_config() + w = watch.Watch() + c = client.CoreV1Api() + deployed = 0 + log.info("Waiting for polyaxon pods to be terminated:") + for e in w.stream(c.list_namespaced_pod, namespace=self.namespace): + ob = e["object"] + + log.debug(f'{deployed} pods out of 4 were killed') + log.debug("\n new in stream:\n") + log.debug(ob.metadata.name,ob.status.phase) + + if not ob.status.container_statuses[0].ready: + log.info(f'Containers of {ob.metadata.name} are terminated') + deployed = deployed + 1 + if(deployed == 4 ): + w.stop() + # log.info("Finished ") + break + + + log.info("Killed all pods deleteing the namespace:") + res = c.delete_namespace_with_http_info(name=self.namespace) + + + #TODO somehow handel the timouts? + log.info("Checking status of the deleted namespace:") + for e in w.stream(c.list_namespace): + ob = e["object"] + # if the status of our namespace was changed we check if it the namespace was really removed from the cluster by requesting and expecting it to be not found + #TODO do this in other way + if ob.metadata.name == self.namespace: + try: + log.debug(c.read_namespace_status_with_http_info(name=self.namespace)) + except ApiException as err: + log.info(err) + w.stop() + break + + log.info("Finished undeploying") + + +if __name__ == "__main__": + #main() + bench = PolyaxonBenchmark(1,1,resources={ + # "dockerUserLogin":"", + # "dockerUserPassword":"", + # "studyName":"" + "jobsCount":5, + "workerCount":5 + }) + bench.deploy() + bench.setup() + bench.run() + bench.collect_run_results() + bench.undeploy() From df16ec56d67e7c6a8869b8e87c35f4fbeea81487 Mon Sep 17 00:00:00 2001 From: witja46 Date: Wed, 3 Aug 2022 13:18:12 +0200 Subject: [PATCH 2/9] Added some comments, default logging level changed to critical --- experiments/polyaxon_minikube/grid.yaml | 2 +- .../polyaxon_minikube/mnist_task/task.py | 16 ++++++++-- .../polyaxon_minikube/polyaxon_benchmark.py | 30 ++++++++++--------- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml index a0e9625..7a681ad 100644 --- a/experiments/polyaxon_minikube/grid.yaml +++ b/experiments/polyaxon_minikube/grid.yaml @@ -1,6 +1,6 @@ version: 1.1 kind: operation -name: polyaxon-study-72 +name: polyaxon-study-88 matrix: kind: grid numRuns: 5 diff --git a/experiments/polyaxon_minikube/mnist_task/task.py b/experiments/polyaxon_minikube/mnist_task/task.py index d18bae3..aa6eb13 100644 --- a/experiments/polyaxon_minikube/mnist_task/task.py +++ b/experiments/polyaxon_minikube/mnist_task/task.py @@ -1,11 +1,15 @@ -from __future__ import print_function - from polyaxon import tracking import argparse import logging import os import numpy as np import time +import sys +PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../..")) +sys.path.append(PROJECT_ROOT) +from ml_benchmark.benchmark_runner import Benchmark +from ml_benchmark.workload.mnist.mnist_task import MnistTask + def train(times ,epoch): loss = 1 @@ -56,6 +60,14 @@ def main(): train(batch_size,epoch) test() + #TODO swicht to the real task + # task = MnistTask(config_init={"epochs": 1}) + # objective = task.create_objective() + + # objective.set_hyperparameters({"learning_rate": lr, "weight_decay": 0.01}) + # objective.train() + # validation_scores = objective.validate() + # print(validation_scores) #polyaxon tracking.log_metrics(met1=0.8, met2= 0.9) diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py index 0295e21..c1089cd 100644 --- a/experiments/polyaxon_minikube/polyaxon_benchmark.py +++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py @@ -36,11 +36,12 @@ def __init__(self, objective, grid, resources) -> None: self.experiment_file_name = "grid.yaml" self.project_description = "Somer random description" self.polyaxon_addr="http://localhost:31833/" - + config.load_kube_config() - - log.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',level=log.INFO) + self.logging_level= self.resources.get("loggingLevel",log.CRITICAL) + + log.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',level=self.logging_level) if "dockerImageTag" in self.resources: @@ -112,29 +113,29 @@ def deploy(self): # c = client.AppsV1Api() deployed = 0 + log.info("Waiting for polyaxon pods to be read:") + # From all pods that polyaxon starts we are onlly really intrested for following 4 that are crucial for runnig of the experiments monitored_pods = ["polyaxon-polyaxon-streams","polyaxon-polyaxon-operator","polyaxon-polyaxon-gateway","polyaxon-polyaxon-api"] # TODO changing to list_namespaced_deployments? for e in w.stream(c.list_namespaced_pod, namespace="polyaxon"): ob = e["object"] for name in monitored_pods: - if name in ob.metadata.name: - # log.info(ob.metadata.name) - #log.info(ob) + #checking if it is one of the pods that we want to monitor + if name in ob.metadata.name: + + # Checking if the pod already is runnig and its underlying containers are ready if ob.status.phase == "Running" and ob.status.container_statuses[0].ready: - # if ob.status.ready_replicas is not None and ob.status.ready_replicas >= 1: log.info(f'{ob.metadata.name} is ready') monitored_pods.remove(name) deployed = deployed + 1 - # log.info(ob) + + #if all monitored pods are running the deployment process was ended if(deployed == 4 ): w.stop() - log.info("Finished deploying") - - res = os.popen('kubectl get deployment -n polyaxon').read() - log.info(res) + log.info("Finished deploying crucial pods") @@ -217,7 +218,7 @@ def run(self): log.info(res) - #TODO change to kubernetes api for monitoring runing trials + #TODO switch to kubernetes api for monitoring runing trials log.info("Waiting for the run to finish:") finished = False while not finished: @@ -325,7 +326,8 @@ def undeploy(self): # "dockerUserPassword":"", # "studyName":"" "jobsCount":5, - "workerCount":5 + "workerCount":5, + "loggingLevel":log.INFO }) bench.deploy() bench.setup() From 6def8e90810b4115e7801d5ed8934642e40768b9 Mon Sep 17 00:00:00 2001 From: witja46 Date: Thu, 4 Aug 2022 14:25:25 +0200 Subject: [PATCH 3/9] Polyaxon now runs with help of BenchmarkRunner, changed logging style, defualt logginglevel == INFO --- ...mark_results__2022-08-04_13-00-24__id.json | 1 + ...mark_results__2022-08-04_13-06-41__id.json | 1 + ...mark_results__2022-08-04_14-06-58__id.json | 1 + experiments/polyaxon_minikube/grid.yaml | 2 +- .../polyaxon_minikube/mnist_task/task.py | 13 ++-- .../polyaxon_minikube/polyaxon_benchmark.py | 62 +++++++++++++------ ml_benchmark/__init__.py | 2 +- 7 files changed, 53 insertions(+), 29 deletions(-) create mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json create mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json create mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json new file mode 100644 index 0000000..e898192 --- /dev/null +++ b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json @@ -0,0 +1 @@ +{"benchmark_metrics": {"latency": [{"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_145644815288", "function_name": "deploy", "start_time": "2022-08-04 13:00:27.963286", "end_time": "2022-08-04 13:02:02.040017", "duration_sec": 94.076731}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_145644815288", "function_name": "setup", "start_time": "2022-08-04 13:02:02.103506", "end_time": "2022-08-04 13:02:02.108233", "duration_sec": 0.004727}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_145644815288", "function_name": "run", "start_time": "2022-08-04 13:02:02.131942", "end_time": "2022-08-04 13:02:25.957963", "duration_sec": 23.826021}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_145644815288", "function_name": "collect_run_results", "start_time": "2022-08-04 13:02:25.981388", "end_time": "2022-08-04 13:02:25.994767", "duration_sec": 0.013379}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_145644815288", "function_name": "test", "start_time": "2022-08-04 13:02:26.015907", "end_time": "2022-08-04 13:02:26.015907", "duration_sec": 0.0}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_145644815288", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 13:02:26.035917", "end_time": "2022-08-04 13:02:26.035917", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"jobsCount": 5, "workerCount": 5, "loggingLevel": 20}}} \ No newline at end of file diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json new file mode 100644 index 0000000..b14a3fa --- /dev/null +++ b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json @@ -0,0 +1 @@ +{"benchmark_metrics": {"latency": [{"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_133265314672", "function_name": "deploy", "start_time": "2022-08-04 13:06:45.483067", "end_time": "2022-08-04 13:08:15.610367", "duration_sec": 90.1273}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_133265314672", "function_name": "setup", "start_time": "2022-08-04 13:08:15.663572", "end_time": "2022-08-04 13:08:15.664632", "duration_sec": 0.00106}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_133265314672", "function_name": "run", "start_time": "2022-08-04 13:08:15.687857", "end_time": "2022-08-04 13:08:39.550174", "duration_sec": 23.862317}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_133265314672", "function_name": "collect_run_results", "start_time": "2022-08-04 13:08:39.576174", "end_time": "2022-08-04 13:08:39.588927", "duration_sec": 0.012753}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_133265314672", "function_name": "test", "start_time": "2022-08-04 13:08:39.611043", "end_time": "2022-08-04 13:08:39.611043", "duration_sec": 0.0}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_133265314672", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 13:08:39.633044", "end_time": "2022-08-04 13:08:39.633044", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"jobsCount": 5, "workerCount": 5, "loggingLevel": 50}}} \ No newline at end of file diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json new file mode 100644 index 0000000..2de26d8 --- /dev/null +++ b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json @@ -0,0 +1 @@ +{"benchmark_metrics": {"latency": [{"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_125399655361", "function_name": "deploy", "start_time": "2022-08-04 14:07:02.174784", "end_time": "2022-08-04 14:08:31.542018", "duration_sec": 89.367234}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_125399655361", "function_name": "setup", "start_time": "2022-08-04 14:08:31.588224", "end_time": "2022-08-04 14:08:36.247162", "duration_sec": 4.658938}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_125399655361", "function_name": "run", "start_time": "2022-08-04 14:08:36.268226", "end_time": "2022-08-04 14:08:59.486441", "duration_sec": 23.218215}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_125399655361", "function_name": "collect_run_results", "start_time": "2022-08-04 14:08:59.507670", "end_time": "2022-08-04 14:08:59.521064", "duration_sec": 0.013394}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_125399655361", "function_name": "test", "start_time": "2022-08-04 14:08:59.543152", "end_time": "2022-08-04 14:08:59.543152", "duration_sec": 0.0}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_125399655361", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 14:08:59.562769", "end_time": "2022-08-04 14:08:59.562769", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"dockerUserLogin": "witja46", "dockerUserPassword": "J$rmakowicz1998", "jobsCount": 5, "workerCount": 5, "loggingLevel": 20}}} \ No newline at end of file diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml index 7a681ad..70d540d 100644 --- a/experiments/polyaxon_minikube/grid.yaml +++ b/experiments/polyaxon_minikube/grid.yaml @@ -1,6 +1,6 @@ version: 1.1 kind: operation -name: polyaxon-study-88 +name: polyaxon-study-15 matrix: kind: grid numRuns: 5 diff --git a/experiments/polyaxon_minikube/mnist_task/task.py b/experiments/polyaxon_minikube/mnist_task/task.py index aa6eb13..7fd4d46 100644 --- a/experiments/polyaxon_minikube/mnist_task/task.py +++ b/experiments/polyaxon_minikube/mnist_task/task.py @@ -1,14 +1,13 @@ from polyaxon import tracking import argparse import logging -import os import numpy as np import time -import sys -PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../..")) -sys.path.append(PROJECT_ROOT) -from ml_benchmark.benchmark_runner import Benchmark -from ml_benchmark.workload.mnist.mnist_task import MnistTask +# import os +# PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../..")) +# sys.path.append(PROJECT_ROOT) +# from ml_benchmark.benchmark_runner import Benchmark +# from ml_benchmark.workload.mnist.mnist_task import MnistTask def train(times ,epoch): @@ -60,7 +59,7 @@ def main(): train(batch_size,epoch) test() - #TODO swicht to the real task + # #TODO swicht to the real task # task = MnistTask(config_init={"epochs": 1}) # objective = task.create_objective() diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py index c1089cd..d8e92ac 100644 --- a/experiments/polyaxon_minikube/polyaxon_benchmark.py +++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py @@ -5,16 +5,14 @@ import json from os import path import os +from socket import timeout import sys from time import sleep import random from kubernetes import client, config,watch from kubernetes.client.rest import ApiException from string import Template -import yaml import docker -PROJECT_ROOT = path.abspath(path.join(__file__ ,"../../..")) -sys.path.append(PROJECT_ROOT) from ml_benchmark.benchmark_runner import Benchmark import requests import subprocess @@ -25,9 +23,9 @@ class PolyaxonBenchmark(Benchmark): - def __init__(self, objective, grid, resources) -> None: - self.objective = objective - self.grid = grid + def __init__(self, resources) -> None: + # self.objective = objective + # self.grid = grid self.resources = resources self.group="kubeflow.org" self.version="v1beta1" @@ -36,12 +34,13 @@ def __init__(self, objective, grid, resources) -> None: self.experiment_file_name = "grid.yaml" self.project_description = "Somer random description" self.polyaxon_addr="http://localhost:31833/" + self.post_forward_process=False config.load_kube_config() self.logging_level= self.resources.get("loggingLevel",log.CRITICAL) - log.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',level=self.logging_level) + log.basicConfig(format='%(asctime)s Polyaxon Benchmark %(levelname)s: %(message)s',level=self.logging_level) if "dockerImageTag" in self.resources: @@ -190,11 +189,14 @@ def setup(self): for line in logs : log.info(line) + + #TODO check if the image is runing properly and there is no problems with it + #pushing to repo self.client.login(username=self.docker_user, password=self.docker_pasword) for line in self.client.images.push(self.trial_tag, stream=True, decode=True): log.info(line) - + def run(self): @@ -218,6 +220,8 @@ def run(self): log.info(res) + #TODO check if there is no problem with the image + #TODO switch to kubernetes api for monitoring runing trials log.info("Waiting for the run to finish:") finished = False @@ -259,11 +263,12 @@ def test(self): def undeploy(self): - log.info("Terminating post forwarding process:") - process = psutil.Process(self.post_forward_process.pid) - for proc in process.children(recursive=True): - proc.kill() - process.kill() + if(self.post_forward_process): + log.info("Terminating post forwarding process:") + process = psutil.Process(self.post_forward_process.pid) + for proc in process.children(recursive=True): + proc.kill() + process.kill() @@ -308,11 +313,14 @@ def undeploy(self): ob = e["object"] # if the status of our namespace was changed we check if it the namespace was really removed from the cluster by requesting and expecting it to be not found #TODO do this in other way + + if ob.metadata.name == self.namespace: try: log.debug(c.read_namespace_status_with_http_info(name=self.namespace)) except ApiException as err: log.info(err) + log.info("Namespace sucessfully deleted") w.stop() break @@ -321,16 +329,30 @@ def undeploy(self): if __name__ == "__main__": #main() - bench = PolyaxonBenchmark(1,1,resources={ + # bench = PolyaxonBenchmark(resources={ + # "dockerUserLogin":"", + # "dockerUserPassword":"", + # # "studyName":"" + # "jobsCount":5, + # "workerCount":5, + # "loggingLevel":log.INFO + # }) + # bench.deploy() + # bench.setup() + # bench.run() + # bench.collect_run_results() + # bench.undeploy() + + resources={ # "dockerUserLogin":"", # "dockerUserPassword":"", # "studyName":"" "jobsCount":5, "workerCount":5, "loggingLevel":log.INFO - }) - bench.deploy() - bench.setup() - bench.run() - bench.collect_run_results() - bench.undeploy() + } + from ml_benchmark.benchmark_runner import BenchmarkRunner + runner = BenchmarkRunner( + benchmark_cls=PolyaxonBenchmark, resources=resources) + runner.run() + diff --git a/ml_benchmark/__init__.py b/ml_benchmark/__init__.py index a1abe42..242acda 100644 --- a/ml_benchmark/__init__.py +++ b/ml_benchmark/__init__.py @@ -1,7 +1,7 @@ __version__ = "develop" install_requires = [ "scikit-learn==0.24.2", - "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==5.0.3", + "tqdm==4.62.3", "SQLAlchemy==1.4.31", "docker==4.2.2", "psycopg2-binary"], test_install_requires = ["pytest==7.1.2", "pytest-cov==3.0.0"] URL = "https://github.com/gebauerm/ml_benchmark" From a66fad620ce22100c9c45741b27868c35f56e46b Mon Sep 17 00:00:00 2001 From: witja46 Date: Thu, 4 Aug 2022 14:40:13 +0200 Subject: [PATCH 4/9] Rerun --- .../benchmark_results__2022-08-04_14-28-03__id.json | 1 + experiments/polyaxon_minikube/grid.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json new file mode 100644 index 0000000..a9558b1 --- /dev/null +++ b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json @@ -0,0 +1 @@ +{"benchmark_metrics": {"latency": [{"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_115109159852", "function_name": "deploy", "start_time": "2022-08-04 14:28:05.734421", "end_time": "2022-08-04 14:29:23.529874", "duration_sec": 77.795453}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_115109159852", "function_name": "setup", "start_time": "2022-08-04 14:29:23.580757", "end_time": "2022-08-04 14:29:23.581278", "duration_sec": 0.000521}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_115109159852", "function_name": "run", "start_time": "2022-08-04 14:29:23.609547", "end_time": "2022-08-04 14:29:48.600260", "duration_sec": 24.990713}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_115109159852", "function_name": "collect_run_results", "start_time": "2022-08-04 14:29:48.624975", "end_time": "2022-08-04 14:29:48.638567", "duration_sec": 0.013592}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_115109159852", "function_name": "test", "start_time": "2022-08-04 14:29:48.661305", "end_time": "2022-08-04 14:29:48.661305", "duration_sec": 0.0}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_115109159852", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 14:29:48.682667", "end_time": "2022-08-04 14:29:48.682667", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"jobsCount": 5, "workerCount": 5, "loggingLevel": 20}}} \ No newline at end of file diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml index 70d540d..00f6adf 100644 --- a/experiments/polyaxon_minikube/grid.yaml +++ b/experiments/polyaxon_minikube/grid.yaml @@ -1,6 +1,6 @@ version: 1.1 kind: operation -name: polyaxon-study-15 +name: polyaxon-study-29 matrix: kind: grid numRuns: 5 From 36f05eacd2b889ee732d77074e18e730c22b14ef Mon Sep 17 00:00:00 2001 From: witja46 Date: Wed, 10 Aug 2022 11:37:45 +0200 Subject: [PATCH 5/9] Mnist task docker image is now generated from polyaxon_benchmark --- .gitignore | 1 + .../experiment_template.yaml | 9 ++- experiments/polyaxon_minikube/grid.yaml | 13 +++- .../polyaxon_minikube/mnist_task/Dockerfile | 23 +++--- .../polyaxon_minikube/mnist_task/task.py | 73 +++++++++++-------- .../polyaxon_minikube/polyaxon_benchmark.py | 40 ++++++---- .../polyaxon_minikube/requirements.txt | 1 + 7 files changed, 101 insertions(+), 59 deletions(-) create mode 100644 experiments/polyaxon_minikube/requirements.txt diff --git a/.gitignore b/.gitignore index 89eb912..757c4e3 100644 --- a/.gitignore +++ b/.gitignore @@ -131,5 +131,6 @@ dmypy.json exp__* experiments/simple_raytune/benchmark__RaytuneBenchmark experiments/optuna_minikube/benchmark__OptunaMinikubeBenchmark +experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark data/ \ No newline at end of file diff --git a/experiments/polyaxon_minikube/experiment_template.yaml b/experiments/polyaxon_minikube/experiment_template.yaml index 507a2a5..115774d 100644 --- a/experiments/polyaxon_minikube/experiment_template.yaml +++ b/experiments/polyaxon_minikube/experiment_template.yaml @@ -17,5 +17,12 @@ component: kind: job container: image: $worker_image - command: [python3, /opt/mnist_task/task.py] + command: [python3, experiments/polyaxon_minikube/mnist_task/task.py] args: ["--batch-size=64", "--lr={{ lr }}"] + env: + - name: STUDY_NAME + value: "$study_name" + - name: DB_CONN + value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" + - name: "METRICS_STORAGE_HOST" + value: "$metrics_ip" diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml index 00f6adf..a636cd3 100644 --- a/experiments/polyaxon_minikube/grid.yaml +++ b/experiments/polyaxon_minikube/grid.yaml @@ -1,6 +1,6 @@ version: 1.1 kind: operation -name: polyaxon-study-29 +name: polyaxon-study-3 matrix: kind: grid numRuns: 5 @@ -16,6 +16,13 @@ component: run: kind: job container: - image: witja46/mnist_polyaxon:latest - command: [python3, /opt/mnist_task/task.py] + image: witja46/mnist_test + command: [python3, experiments/polyaxon_minikube/mnist_task/task.py] args: ["--batch-size=64", "--lr={{ lr }}"] + env: + - name: STUDY_NAME + value: "polyaxon-study-3" + - name: DB_CONN + value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" + - name: "METRICS_STORAGE_HOST" + value: "134.101.4.161" diff --git a/experiments/polyaxon_minikube/mnist_task/Dockerfile b/experiments/polyaxon_minikube/mnist_task/Dockerfile index 1cbfd5b..5ed393d 100644 --- a/experiments/polyaxon_minikube/mnist_task/Dockerfile +++ b/experiments/polyaxon_minikube/mnist_task/Dockerfile @@ -1,15 +1,16 @@ -FROM python:3.9-slim +#TODO change to some slimer base image +FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime +#FROM python:3.9-slim -ADD . /opt/mnist_task -WORKDIR /opt/mnist_task +RUN pip install pip --upgrade +RUN pip install polyaxon -# Add folder for the logs. -RUN mkdir /katib -RUN pip install --no-cache-dir -r requirements.txt +COPY experiments experiments +COPY data data +COPY setup.py setup.py +COPY ml_benchmark ml_benchmark -RUN chgrp -R 0 /opt/mnist_task \ - && chmod -R g+rwX /opt/mnist_task \ - && chgrp -R 0 /katib \ - && chmod -R g+rwX /katib -ENTRYPOINT ["python3", "/opt/mnist_task/task.py"] +RUN pip install -e . + +CMD ["python", "experiments/polyaxon_minikube/mnist_task/task.py"] diff --git a/experiments/polyaxon_minikube/mnist_task/task.py b/experiments/polyaxon_minikube/mnist_task/task.py index 7fd4d46..3763745 100644 --- a/experiments/polyaxon_minikube/mnist_task/task.py +++ b/experiments/polyaxon_minikube/mnist_task/task.py @@ -3,11 +3,11 @@ import logging import numpy as np import time -# import os -# PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../..")) -# sys.path.append(PROJECT_ROOT) -# from ml_benchmark.benchmark_runner import Benchmark -# from ml_benchmark.workload.mnist.mnist_task import MnistTask +import os +import sys +PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../../../")) +sys.path.append(PROJECT_ROOT) +from ml_benchmark.workload.mnist.mnist_task import MnistTask def train(times ,epoch): @@ -30,9 +30,7 @@ def test(): def main(): - - #Polyaxon - tracking.init() + #parsing arguments parser = argparse.ArgumentParser(description="MNIST Example") @@ -43,34 +41,47 @@ def main(): parser.add_argument("--lr", type=float, default=0.01, metavar="LR", help="learning rate (default: 0.01)") args = parser.parse_args() - - #timestamps format - logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - datefmt="%Y-%m-%dT%H:%M:%SZ", - level=logging.DEBUG) - - - #model taining and testing epochs = args.epochs batch_size = args.batch_size lr = args.lr - for epoch in np.arange(1,epochs+1): - train(batch_size,epoch) - test() - # #TODO swicht to the real task - # task = MnistTask(config_init={"epochs": 1}) - # objective = task.create_objective() + + + #TODO delete logging? + #Some logging + # timestamps format + # logging.basicConfig( + # format="%(asctime)s %(levelname)-8s %(message)s", + # datefmt="%Y-%m-%dT%H:%M:%SZ", + # level=logging.DEBUG) + # for epoch in np.arange(1,epochs+1): + # train(batch_size,epoch) + # test() + + + + #MnistTask + task = MnistTask(config_init={"epochs": 1}) + objective = task.create_objective() + #TODO add the weight decay to the definition of the template + objective.set_hyperparameters({"learning_rate": lr, "weight_decay": 0.01}) + objective.train() + validation_scores = objective.validate() - # objective.set_hyperparameters({"learning_rate": lr, "weight_decay": 0.01}) - # objective.train() - # validation_scores = objective.validate() - # print(validation_scores) - - #polyaxon - tracking.log_metrics(met1=0.8, met2= 0.9) - tracking.log_outputs(loss=0.9,accuracy=0.6) + + #Geting results + avg = validation_scores["weighted avg"] + print("precision",avg["precision"]) + print("f1-score",avg["f1-score"]) + + + # Polyaxon + #initiating polyaxon tracking + tracking.init() + #loging metrics + tracking.log_metrics(recall=avg["recall"], ) + #logging the results + tracking.log_outputs(f1score=avg["f1-score"],precision=avg["precision"]) if __name__ == "__main__": diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py index d8e92ac..4bc1eb8 100644 --- a/experiments/polyaxon_minikube/polyaxon_benchmark.py +++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py @@ -1,5 +1,6 @@ from asyncio import subprocess +from base64 import decode from concurrent.futures import process from itertools import count import json @@ -9,6 +10,7 @@ import sys from time import sleep import random +from urllib.request import urlopen from kubernetes import client, config,watch from kubernetes.client.rest import ApiException from string import Template @@ -42,18 +44,20 @@ def __init__(self, resources) -> None: log.basicConfig(format='%(asctime)s Polyaxon Benchmark %(levelname)s: %(message)s',level=self.logging_level) + + self.metrics_ip = resources.get("metricsIP") if "dockerImageTag" in self.resources: self.trial_tag = self.resources["dockerImageTag"] else: - self.trial_tag = "mnist_polyaxon:latest" + self.trial_tag = "mnist_test" if "dockerUserLogin" in self.resources: self.docker_user = self.resources["dockerUserLogin"] - self.trial_tag = f'{self.docker_user}/{self.trial_tag}' else: - self.docker_user = "" - self.trial_tag = "witja46/mnist_polyaxon:latest" + self.docker_user = "witja46" + + self.trial_tag = f'{self.docker_user}/{self.trial_tag}' if "dockerUserPassword" in self.resources: self.docker_pasword = self.resources["dockerUserPassword"] @@ -164,7 +168,8 @@ def setup(self): "worker_mem": f"{self.workerMemory}Gi", "worker_image": self.trial_tag, "study_name": self.study_name, - "trialParameters":"${trialParameters.learningRate}" + "trialParameters":"${trialParameters.learningRate}", + "metrics_ip": self.metrics_ip, } #loading and filling the template @@ -182,9 +187,11 @@ def setup(self): if "dockerUserLogin" in self.resources: #creating task docker image - log.info("Creating task docker image") + log.info("Creating task docker image") + #TODO do all this inside of minikube instead of on the client? self.client = docker.client.from_env() - image, logs = self.client.images.build(path="./mnist_task",tag=self.trial_tag) + PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../")) + image, logs = self.client.images.build(path=PROJECT_ROOT, dockerfile="experiments/polyaxon_minikube/mnist_task/Dockerfile" ,tag=self.trial_tag) log.info(f"Image: {self.trial_tag}") for line in logs : log.info(line) @@ -194,7 +201,10 @@ def setup(self): #pushing to repo self.client.login(username=self.docker_user, password=self.docker_pasword) - for line in self.client.images.push(self.trial_tag, stream=True, decode=True): + + log.info("Pushing image to the Dockerhub") + + for line in self.client.api.push(self.trial_tag,stream=True,decode=True): log.info(line) @@ -282,6 +292,7 @@ def undeploy(self): # Waiting untill all polyaxon pods get terminated + #TODO add logic in case of no existent polyaxon deployment config.load_kube_config() w = watch.Watch() c = client.CoreV1Api() @@ -337,22 +348,25 @@ def undeploy(self): # "workerCount":5, # "loggingLevel":log.INFO # }) - # bench.deploy() + #bench.deploy() # bench.setup() # bench.run() # bench.collect_run_results() - # bench.undeploy() + # bench.undeploy() resources={ - # "dockerUserLogin":"", - # "dockerUserPassword":"", + # "dockerUserLogin":"", + # "dockerUserPassword":"", # "studyName":"" + "dockerImageTag":"mnist_test", "jobsCount":5, "workerCount":5, - "loggingLevel":log.INFO + "loggingLevel":log.INFO, + "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip() } from ml_benchmark.benchmark_runner import BenchmarkRunner runner = BenchmarkRunner( benchmark_cls=PolyaxonBenchmark, resources=resources) runner.run() + diff --git a/experiments/polyaxon_minikube/requirements.txt b/experiments/polyaxon_minikube/requirements.txt new file mode 100644 index 0000000..4e272bb --- /dev/null +++ b/experiments/polyaxon_minikube/requirements.txt @@ -0,0 +1 @@ +polyaxon From f791be277e4ed93646b8eac1576eac5cea892a32 Mon Sep 17 00:00:00 2001 From: witja46 Date: Tue, 16 Aug 2022 16:52:31 +0200 Subject: [PATCH 6/9] 1. Switched from os.popen to invoking polyaxon CLI commands with clicks CliRunner. 2. Docker image gets build inside of the minikube 3. Double checked that job containers get the ip Address of the host. Still the latencies of those do not get recorded. --- ...mark_results__2022-08-04_13-00-24__id.json | 1 - ...mark_results__2022-08-04_13-06-41__id.json | 1 - ...mark_results__2022-08-04_14-06-58__id.json | 1 - ...mark_results__2022-08-04_14-28-03__id.json | 1 - .../experiment_template.yaml | 1 + experiments/polyaxon_minikube/grid.yaml | 7 +- .../polyaxon_minikube/mnist_task/Dockerfile | 1 + .../polyaxon_minikube/mnist_task/task.py | 21 ++- .../polyaxon_minikube/polyaxon_benchmark.py | 148 +++++++++++------- .../polyaxon_minikube/requirements.txt | 1 + 10 files changed, 108 insertions(+), 75 deletions(-) delete mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json delete mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json delete mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json delete mode 100644 experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json deleted file mode 100644 index e898192..0000000 --- a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-00-24__id.json +++ /dev/null @@ -1 +0,0 @@ -{"benchmark_metrics": {"latency": [{"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_145644815288", "function_name": "deploy", "start_time": "2022-08-04 13:00:27.963286", "end_time": "2022-08-04 13:02:02.040017", "duration_sec": 94.076731}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_145644815288", "function_name": "setup", "start_time": "2022-08-04 13:02:02.103506", "end_time": "2022-08-04 13:02:02.108233", "duration_sec": 0.004727}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_145644815288", "function_name": "run", "start_time": "2022-08-04 13:02:02.131942", "end_time": "2022-08-04 13:02:25.957963", "duration_sec": 23.826021}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_145644815288", "function_name": "collect_run_results", "start_time": "2022-08-04 13:02:25.981388", "end_time": "2022-08-04 13:02:25.994767", "duration_sec": 0.013379}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_145644815288", "function_name": "test", "start_time": "2022-08-04 13:02:26.015907", "end_time": "2022-08-04 13:02:26.015907", "duration_sec": 0.0}, {"metric_id": "id_16e78bd6-d5c3-4c94-8caa-545676516743__pid_17104__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_145644815288", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 13:02:26.035917", "end_time": "2022-08-04 13:02:26.035917", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"jobsCount": 5, "workerCount": 5, "loggingLevel": 20}}} \ No newline at end of file diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json deleted file mode 100644 index b14a3fa..0000000 --- a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_13-06-41__id.json +++ /dev/null @@ -1 +0,0 @@ -{"benchmark_metrics": {"latency": [{"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_133265314672", "function_name": "deploy", "start_time": "2022-08-04 13:06:45.483067", "end_time": "2022-08-04 13:08:15.610367", "duration_sec": 90.1273}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_133265314672", "function_name": "setup", "start_time": "2022-08-04 13:08:15.663572", "end_time": "2022-08-04 13:08:15.664632", "duration_sec": 0.00106}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_133265314672", "function_name": "run", "start_time": "2022-08-04 13:08:15.687857", "end_time": "2022-08-04 13:08:39.550174", "duration_sec": 23.862317}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_133265314672", "function_name": "collect_run_results", "start_time": "2022-08-04 13:08:39.576174", "end_time": "2022-08-04 13:08:39.588927", "duration_sec": 0.012753}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_133265314672", "function_name": "test", "start_time": "2022-08-04 13:08:39.611043", "end_time": "2022-08-04 13:08:39.611043", "duration_sec": 0.0}, {"metric_id": "id_03d47e41-08b2-4b01-a6e2-5117d647f3a1__pid_7328__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_133265314672", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 13:08:39.633044", "end_time": "2022-08-04 13:08:39.633044", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"jobsCount": 5, "workerCount": 5, "loggingLevel": 50}}} \ No newline at end of file diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json deleted file mode 100644 index 2de26d8..0000000 --- a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-06-58__id.json +++ /dev/null @@ -1 +0,0 @@ -{"benchmark_metrics": {"latency": [{"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_125399655361", "function_name": "deploy", "start_time": "2022-08-04 14:07:02.174784", "end_time": "2022-08-04 14:08:31.542018", "duration_sec": 89.367234}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_125399655361", "function_name": "setup", "start_time": "2022-08-04 14:08:31.588224", "end_time": "2022-08-04 14:08:36.247162", "duration_sec": 4.658938}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_125399655361", "function_name": "run", "start_time": "2022-08-04 14:08:36.268226", "end_time": "2022-08-04 14:08:59.486441", "duration_sec": 23.218215}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_125399655361", "function_name": "collect_run_results", "start_time": "2022-08-04 14:08:59.507670", "end_time": "2022-08-04 14:08:59.521064", "duration_sec": 0.013394}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_125399655361", "function_name": "test", "start_time": "2022-08-04 14:08:59.543152", "end_time": "2022-08-04 14:08:59.543152", "duration_sec": 0.0}, {"metric_id": "id_e7b75edd-2323-409c-9e11-71a61caf83a1__pid_17848__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_125399655361", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 14:08:59.562769", "end_time": "2022-08-04 14:08:59.562769", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"dockerUserLogin": "witja46", "dockerUserPassword": "J$rmakowicz1998", "jobsCount": 5, "workerCount": 5, "loggingLevel": 20}}} \ No newline at end of file diff --git a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json b/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json deleted file mode 100644 index a9558b1..0000000 --- a/experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark/benchmark_results__2022-08-04_14-28-03__id.json +++ /dev/null @@ -1 +0,0 @@ -{"benchmark_metrics": {"latency": [{"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_deploy__objHash_115109159852", "function_name": "deploy", "start_time": "2022-08-04 14:28:05.734421", "end_time": "2022-08-04 14:29:23.529874", "duration_sec": 77.795453}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_setup__objHash_115109159852", "function_name": "setup", "start_time": "2022-08-04 14:29:23.580757", "end_time": "2022-08-04 14:29:23.581278", "duration_sec": 0.000521}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_run__objHash_115109159852", "function_name": "run", "start_time": "2022-08-04 14:29:23.609547", "end_time": "2022-08-04 14:29:48.600260", "duration_sec": 24.990713}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_collect_run_results__objHash_115109159852", "function_name": "collect_run_results", "start_time": "2022-08-04 14:29:48.624975", "end_time": "2022-08-04 14:29:48.638567", "duration_sec": 0.013592}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_test__objHash_115109159852", "function_name": "test", "start_time": "2022-08-04 14:29:48.661305", "end_time": "2022-08-04 14:29:48.661305", "duration_sec": 0.0}, {"metric_id": "id_8d73b0ab-bb06-4d69-8358-b4e1a524568d__pid_13192__hostname_DESKTOP-LK1LTO5__function-name_collect_benchmark_metrics__objHash_115109159852", "function_name": "collect_benchmark_metrics", "start_time": "2022-08-04 14:29:48.682667", "end_time": "2022-08-04 14:29:48.682667", "duration_sec": 0.0}], "resources": null, "classification": null}, "benchmark_configuration": {"resources": {"jobsCount": 5, "workerCount": 5, "loggingLevel": 20}}} \ No newline at end of file diff --git a/experiments/polyaxon_minikube/experiment_template.yaml b/experiments/polyaxon_minikube/experiment_template.yaml index 115774d..703895f 100644 --- a/experiments/polyaxon_minikube/experiment_template.yaml +++ b/experiments/polyaxon_minikube/experiment_template.yaml @@ -16,6 +16,7 @@ component: run: kind: job container: + imagePullPolicy: IfNotPresent image: $worker_image command: [python3, experiments/polyaxon_minikube/mnist_task/task.py] args: ["--batch-size=64", "--lr={{ lr }}"] diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml index a636cd3..3665986 100644 --- a/experiments/polyaxon_minikube/grid.yaml +++ b/experiments/polyaxon_minikube/grid.yaml @@ -1,6 +1,6 @@ version: 1.1 kind: operation -name: polyaxon-study-3 +name: polyaxon-study-50 matrix: kind: grid numRuns: 5 @@ -16,12 +16,13 @@ component: run: kind: job container: - image: witja46/mnist_test + imagePullPolicy: IfNotPresent + image: mnist_test command: [python3, experiments/polyaxon_minikube/mnist_task/task.py] args: ["--batch-size=64", "--lr={{ lr }}"] env: - name: STUDY_NAME - value: "polyaxon-study-3" + value: "polyaxon-study-50" - name: DB_CONN value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" diff --git a/experiments/polyaxon_minikube/mnist_task/Dockerfile b/experiments/polyaxon_minikube/mnist_task/Dockerfile index 5ed393d..1ef3f09 100644 --- a/experiments/polyaxon_minikube/mnist_task/Dockerfile +++ b/experiments/polyaxon_minikube/mnist_task/Dockerfile @@ -11,6 +11,7 @@ COPY setup.py setup.py COPY ml_benchmark ml_benchmark + RUN pip install -e . CMD ["python", "experiments/polyaxon_minikube/mnist_task/task.py"] diff --git a/experiments/polyaxon_minikube/mnist_task/task.py b/experiments/polyaxon_minikube/mnist_task/task.py index 3763745..6853d47 100644 --- a/experiments/polyaxon_minikube/mnist_task/task.py +++ b/experiments/polyaxon_minikube/mnist_task/task.py @@ -14,7 +14,7 @@ def train(times ,epoch): loss = 1 for x in np.arange(1,times + 1): loss = 1 - (x -1 ) / times - time.sleep(0.01) + time.sleep(0.1) msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format( epoch, x , times, 100. * x / times, loss) @@ -47,21 +47,11 @@ def main(): - #TODO delete logging? - #Some logging - # timestamps format - # logging.basicConfig( - # format="%(asctime)s %(levelname)-8s %(message)s", - # datefmt="%Y-%m-%dT%H:%M:%SZ", - # level=logging.DEBUG) - # for epoch in np.arange(1,epochs+1): - # train(batch_size,epoch) - # test() #MnistTask - task = MnistTask(config_init={"epochs": 1}) + task = MnistTask(config_init={"epochs": epochs}) objective = task.create_objective() #TODO add the weight decay to the definition of the template objective.set_hyperparameters({"learning_rate": lr, "weight_decay": 0.01}) @@ -74,6 +64,13 @@ def main(): print("precision",avg["precision"]) print("f1-score",avg["f1-score"]) + # logging.basicConfig( + # format="%(asctime)s %(levelname)-8s %(message)s", + # datefmt="%Y-%m-%dT%H:%M:%SZ", + # level=logging.DEBUG) + # for epoch in np.arange(1,epochs+1): + # train(batch_size,epoch) + # test() # Polyaxon #initiating polyaxon tracking diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py index 4bc1eb8..57b3a8b 100644 --- a/experiments/polyaxon_minikube/polyaxon_benchmark.py +++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py @@ -1,6 +1,8 @@ +from __future__ import print_function from asyncio import subprocess from base64 import decode +from cmath import pi from concurrent.futures import process from itertools import count import json @@ -11,15 +13,22 @@ from time import sleep import random from urllib.request import urlopen +from venv import create from kubernetes import client, config,watch from kubernetes.client.rest import ApiException from string import Template import docker from ml_benchmark.benchmark_runner import Benchmark +from ml_benchmark.utils.image_build_wrapper import builder_from_string import requests import subprocess import psutil import logging as log +from polyaxon.cli.projects import create +from polyaxon.cli.run import run +from polyaxon.cli.admin import deploy,teardown +from click.testing import CliRunner + @@ -37,7 +46,8 @@ def __init__(self, resources) -> None: self.project_description = "Somer random description" self.polyaxon_addr="http://localhost:31833/" self.post_forward_process=False - + self.cli_runner=CliRunner() + config.load_kube_config() self.logging_level= self.resources.get("loggingLevel",log.CRITICAL) @@ -57,7 +67,7 @@ def __init__(self, resources) -> None: else: self.docker_user = "witja46" - self.trial_tag = f'{self.docker_user}/{self.trial_tag}' + if "dockerUserPassword" in self.resources: self.docker_pasword = self.resources["dockerUserPassword"] @@ -101,10 +111,10 @@ def deploy(self): res = os.popen('helm repo add polyaxon https://charts.polyaxon.com').read() log.info(res) - #TODO deploy via helm for better error handling and easier configuration? log.info("Deploying polyaxon to minikube:") - res = os.popen('polyaxon admin deploy -t minikube').read() - log.info(res) + #invoking polyaxon cli deploy comand + res = self.cli_runner.invoke(deploy) + log.info(res.output) @@ -113,11 +123,10 @@ def deploy(self): config.load_kube_config() w = watch.Watch() c = client.CoreV1Api() - # c = client.AppsV1Api() deployed = 0 - log.info("Waiting for polyaxon pods to be read:") + log.info("Waiting for all polyaxon pods to be ready:") # From all pods that polyaxon starts we are onlly really intrested for following 4 that are crucial for runnig of the experiments monitored_pods = ["polyaxon-polyaxon-streams","polyaxon-polyaxon-operator","polyaxon-polyaxon-gateway","polyaxon-polyaxon-api"] # TODO changing to list_namespaced_deployments? @@ -183,54 +192,44 @@ def setup(self): log.info("Experiment yaml created") - # Creating new docker image if credentials were passed - if "dockerUserLogin" in self.resources: - - #creating task docker image - log.info("Creating task docker image") - #TODO do all this inside of minikube instead of on the client? - self.client = docker.client.from_env() - PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../")) - image, logs = self.client.images.build(path=PROJECT_ROOT, dockerfile="experiments/polyaxon_minikube/mnist_task/Dockerfile" ,tag=self.trial_tag) - log.info(f"Image: {self.trial_tag}") - for line in logs : - log.info(line) - + log.info("Creating task docker image") + #creating docker image inside of the minikube + self.image_builder = builder_from_string("minikube")() + PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../")) + self.image_builder.deploy_image( + "experiments/polyaxon_minikube/mnist_task/Dockerfile", self.trial_tag,PROJECT_ROOT) + print(f"Image: {self.trial_tag}") - #TODO check if the image is runing properly and there is no problems with it - #pushing to repo - self.client.login(username=self.docker_user, password=self.docker_pasword) - - log.info("Pushing image to the Dockerhub") - - for line in self.client.api.push(self.trial_tag,stream=True,decode=True): - log.info(line) + def run(self): - #TODO add error handling + #TODO add error handling. + #TODO change to invocking procject comand instead of sending the http request to the polaxon api? log.info("Creating new project:") - project = requests.post(f'{self.polyaxon_addr}/api/v1/default/projects/create', json={"name": self.study_name, "description": self.project_description}) - log.info(project.text) + # project = requests.post(f'{self.polyaxon_addr}/api/v1/default/projects/create', json={"name": self.study_name, }) + + + options = f'--name {self.study_name} --description '.split() + # adding the project description as the last argument + options.append(f'{self.project_description}') + #invoking polyaxon project create comand + res = self.cli_runner.invoke(create,options) + log.info(res.output) - #TODO find out where to pass the --eager flag so that the run can be created with help of api - # with open(path.join(path.dirname(__file__), self.experiment_file_name), "r") as f: - # body = f.read() - # log.info(body) - # run = requests.post(f'{self.polyaxon_addr}/api/v1/default/{self.study_name}/runs',json={"content":body,"eager":True ,"tags":["--eager"],"meta_info":{"eager":True,"--eager":True,"flags":"--eager"}}) - # log.info(run.text) log.info("Starting polyaxon experiment:") - res = os.popen(f'polyaxon run -f ./{self.experiment_file_name} --project {self.study_name} --eager').read() - log.info(res) + #invoking polyaxon run comand with following options + options = f'-f ./{self.experiment_file_name} --project {self.study_name} --eager'.split() + res = self.cli_runner.invoke(run,options) + log.info(res.output) - #TODO check if there is no problem with the image #TODO switch to kubernetes api for monitoring runing trials log.info("Waiting for the run to finish:") @@ -242,12 +241,19 @@ def run(self): #checking if all runs were finished finished = runs["count"] == self.jobsCount sleep(1) - return + + return + + def collect_benchmark_metrics(self): - pass + log.info("Collecting run results:") + result = self.get_succeeded_runs() + log.info(json.dumps(result,indent=4)) + + return result["results"] def get_succeeded_runs(self, sort_by="duration"): @@ -279,14 +285,20 @@ def undeploy(self): for proc in process.children(recursive=True): proc.kill() process.kill() - - #TODO changing to undeploying with helm? + log.info("Undeploying polyaxon:") - p = subprocess.Popen(["polyaxon", "admin" ,"teardown"],shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE) - out,err = p.communicate(input=b"y") - p.terminate() + res = self.cli_runner.invoke(teardown,["--yes"]) + #by teardown comand the polyaxon cli doesnt set exit_code if there are some problems + if("Polyaxon could not teardown the deployment" in res.output): + raise Exception(f'Exit code: {res.exit_code} Error message: \n{res.output}') + elif(res.exit_code == 0): + print(res.exit_code) + log.info(res.output) + else: + raise Exception(f'Exit code: {res.exit_code} Error message: \n{res.output}') + @@ -341,23 +353,28 @@ def undeploy(self): if __name__ == "__main__": #main() # bench = PolyaxonBenchmark(resources={ - # "dockerUserLogin":"", - # "dockerUserPassword":"", + # # "dockerUserLogin":"", + # # "dockerUserPassword":"", # # "studyName":"" # "jobsCount":5, # "workerCount":5, - # "loggingLevel":log.INFO + # "loggingLevel":log.INFO, + # "dockerImageTag":"nowe", + + # "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip() # }) - #bench.deploy() + + # bench.deploy() # bench.setup() # bench.run() - # bench.collect_run_results() - # bench.undeploy() + # # bench.collect_run_results() + + # bench.undeploy() + # polyaxon config set --host=http://localhost:8000 + resources={ - # "dockerUserLogin":"", - # "dockerUserPassword":"", - # "studyName":"" + # "studyName":"", "dockerImageTag":"mnist_test", "jobsCount":5, "workerCount":5, @@ -369,4 +386,23 @@ def undeploy(self): benchmark_cls=PolyaxonBenchmark, resources=resources) runner.run() + + # print(f'polyaxon run -f ./ --project --eager') + # print(f'polyaxon run -f ./grid --project --eager'.split()) + # runner = CliRunner() + # print("start") + # res = runner.invoke(run,["-f ./grid.yaml --eager -o json"]) + # print("fin") + # print(res.output,res.exit_code) + + # res = runner.invoke(teardown,["--yes"]) + # if("Polyaxon could not teardown the deployment" in res.output): + # print("Ja pierdole") + # print(res.output,res.exit_code,res.exception) + # print("stop") + # res = runner.invoke(deploy) + # print(res.output,res.exit_code) + + # print(res) + diff --git a/experiments/polyaxon_minikube/requirements.txt b/experiments/polyaxon_minikube/requirements.txt index 4e272bb..a1c3b00 100644 --- a/experiments/polyaxon_minikube/requirements.txt +++ b/experiments/polyaxon_minikube/requirements.txt @@ -1 +1,2 @@ polyaxon +kubernetes From bfeb739f36c10d522ce7274e03b411aae07b4053 Mon Sep 17 00:00:00 2001 From: witja46 Date: Mon, 22 Aug 2022 12:27:17 +0200 Subject: [PATCH 7/9] Added tracking of Undeploy, added lighter image for faster testing --- .../experiment_template.yaml | 2 +- experiments/polyaxon_minikube/grid.yaml | 8 ++--- .../polyaxon_minikube/mnist_task/Dockerfile | 2 +- .../polyaxon_minikube/polyaxon_benchmark.py | 30 ++++++++++++------- .../polyaxon_minikube/requirements.txt | 1 + ml_benchmark/benchmark_runner.py | 2 +- 6 files changed, 27 insertions(+), 18 deletions(-) diff --git a/experiments/polyaxon_minikube/experiment_template.yaml b/experiments/polyaxon_minikube/experiment_template.yaml index 703895f..6db4ece 100644 --- a/experiments/polyaxon_minikube/experiment_template.yaml +++ b/experiments/polyaxon_minikube/experiment_template.yaml @@ -18,7 +18,7 @@ component: container: imagePullPolicy: IfNotPresent image: $worker_image - command: [python3, experiments/polyaxon_minikube/mnist_task/task.py] + command: [python3, experiments/polyaxon_minikube/$worker_image/task.py] args: ["--batch-size=64", "--lr={{ lr }}"] env: - name: STUDY_NAME diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml index 3665986..70606bf 100644 --- a/experiments/polyaxon_minikube/grid.yaml +++ b/experiments/polyaxon_minikube/grid.yaml @@ -1,6 +1,6 @@ version: 1.1 kind: operation -name: polyaxon-study-50 +name: polyaxon-study-75 matrix: kind: grid numRuns: 5 @@ -17,12 +17,12 @@ component: kind: job container: imagePullPolicy: IfNotPresent - image: mnist_test - command: [python3, experiments/polyaxon_minikube/mnist_task/task.py] + image: task_light + command: [python3, experiments/polyaxon_minikube/task_light/task.py] args: ["--batch-size=64", "--lr={{ lr }}"] env: - name: STUDY_NAME - value: "polyaxon-study-50" + value: "polyaxon-study-75" - name: DB_CONN value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" diff --git a/experiments/polyaxon_minikube/mnist_task/Dockerfile b/experiments/polyaxon_minikube/mnist_task/Dockerfile index 1ef3f09..b4701bb 100644 --- a/experiments/polyaxon_minikube/mnist_task/Dockerfile +++ b/experiments/polyaxon_minikube/mnist_task/Dockerfile @@ -6,7 +6,7 @@ RUN pip install pip --upgrade RUN pip install polyaxon COPY experiments experiments -COPY data data +#COPY data data COPY setup.py setup.py COPY ml_benchmark ml_benchmark diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py index 57b3a8b..cab7487 100644 --- a/experiments/polyaxon_minikube/polyaxon_benchmark.py +++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py @@ -52,6 +52,7 @@ def __init__(self, resources) -> None: self.logging_level= self.resources.get("loggingLevel",log.CRITICAL) + self.create_clean_image = self.resources.get("createCleanImage",True) log.basicConfig(format='%(asctime)s Polyaxon Benchmark %(levelname)s: %(message)s',level=self.logging_level) @@ -60,7 +61,7 @@ def __init__(self, resources) -> None: if "dockerImageTag" in self.resources: self.trial_tag = self.resources["dockerImageTag"] else: - self.trial_tag = "mnist_test" + self.trial_tag = "mnist_task" if "dockerUserLogin" in self.resources: self.docker_user = self.resources["dockerUserLogin"] @@ -191,14 +192,16 @@ def setup(self): f.write(job_yml_objects) log.info("Experiment yaml created") - - log.info("Creating task docker image") - #creating docker image inside of the minikube - self.image_builder = builder_from_string("minikube")() - PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../")) - self.image_builder.deploy_image( - "experiments/polyaxon_minikube/mnist_task/Dockerfile", self.trial_tag,PROJECT_ROOT) - print(f"Image: {self.trial_tag}") + + if self.create_clean_image: + log.info("Creating task docker image") + #creating docker image inside of the minikube + self.image_builder = builder_from_string("minikube")() + PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../")) + log.info(PROJECT_ROOT) + self.image_builder.deploy_image( + f'experiments/polyaxon_minikube/{self.trial_tag}/Dockerfile', self.trial_tag,PROJECT_ROOT) + print(f"Image: {self.trial_tag}") @@ -347,6 +350,9 @@ def undeploy(self): w.stop() break + + log.info("Deleting image from minikube") + self.image_builder.cleanup(self.trial_tag) log.info("Finished undeploying") @@ -375,11 +381,13 @@ def undeploy(self): resources={ # "studyName":"", - "dockerImageTag":"mnist_test", + "dockerImageTag":"task_light", "jobsCount":5, + "workerCount":5, "loggingLevel":log.INFO, - "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip() + "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), + "createCleanImage":True } from ml_benchmark.benchmark_runner import BenchmarkRunner runner = BenchmarkRunner( diff --git a/experiments/polyaxon_minikube/requirements.txt b/experiments/polyaxon_minikube/requirements.txt index a1c3b00..31e30f0 100644 --- a/experiments/polyaxon_minikube/requirements.txt +++ b/experiments/polyaxon_minikube/requirements.txt @@ -1,2 +1,3 @@ polyaxon kubernetes +torch \ No newline at end of file diff --git a/ml_benchmark/benchmark_runner.py b/ml_benchmark/benchmark_runner.py index d290b0b..8bd0d25 100644 --- a/ml_benchmark/benchmark_runner.py +++ b/ml_benchmark/benchmark_runner.py @@ -142,7 +142,7 @@ def run(self): run_process = [ self.benchmark.deploy, self.benchmark.setup, self.benchmark.run, self.benchmark.collect_run_results, - self.benchmark.test, self.benchmark.collect_benchmark_metrics] + self.benchmark.test, self.benchmark.collect_benchmark_metrics,self.benchmark.undeploy] benchmark_results = None try: From 8c43ae530a6d5c606516747f3773681c6fb98074 Mon Sep 17 00:00:00 2001 From: witja46 Date: Mon, 22 Aug 2022 12:53:06 +0200 Subject: [PATCH 8/9] Throwing exeption in setup phase if task image was not created successfully --- experiments/polyaxon_minikube/grid.yaml | 4 +- .../polyaxon_minikube/polyaxon_benchmark.py | 26 ++++--- .../polyaxon_minikube/task_light/Dockerfile | 16 +++++ .../polyaxon_minikube/task_light/README.md | 11 +++ .../task_light/requirements.txt | 2 + .../polyaxon_minikube/task_light/task.py | 69 +++++++++++++++++++ ml_benchmark/utils/image_build_wrapper.py | 2 +- 7 files changed, 116 insertions(+), 14 deletions(-) create mode 100644 experiments/polyaxon_minikube/task_light/Dockerfile create mode 100644 experiments/polyaxon_minikube/task_light/README.md create mode 100644 experiments/polyaxon_minikube/task_light/requirements.txt create mode 100644 experiments/polyaxon_minikube/task_light/task.py diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml index 70606bf..0e4071e 100644 --- a/experiments/polyaxon_minikube/grid.yaml +++ b/experiments/polyaxon_minikube/grid.yaml @@ -1,6 +1,6 @@ version: 1.1 kind: operation -name: polyaxon-study-75 +name: polyaxon-study-6 matrix: kind: grid numRuns: 5 @@ -22,7 +22,7 @@ component: args: ["--batch-size=64", "--lr={{ lr }}"] env: - name: STUDY_NAME - value: "polyaxon-study-75" + value: "polyaxon-study-6" - name: DB_CONN value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py index cab7487..0ddb940 100644 --- a/experiments/polyaxon_minikube/polyaxon_benchmark.py +++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py @@ -199,8 +199,11 @@ def setup(self): self.image_builder = builder_from_string("minikube")() PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../")) log.info(PROJECT_ROOT) - self.image_builder.deploy_image( + res = self.image_builder.deploy_image( f'experiments/polyaxon_minikube/{self.trial_tag}/Dockerfile', self.trial_tag,PROJECT_ROOT) + print(res) + if f'Successfully tagged {self.trial_tag}' not in res: + raise Exception("Image was not created:",res) print(f"Image: {self.trial_tag}") @@ -352,7 +355,7 @@ def undeploy(self): log.info("Deleting image from minikube") - self.image_builder.cleanup(self.trial_tag) + #self.image_builder.cleanup(self.trial_tag) log.info("Finished undeploying") @@ -370,12 +373,6 @@ def undeploy(self): # "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip() # }) - # bench.deploy() - # bench.setup() - # bench.run() - # # bench.collect_run_results() - - # bench.undeploy() # polyaxon config set --host=http://localhost:8000 @@ -390,10 +387,17 @@ def undeploy(self): "createCleanImage":True } from ml_benchmark.benchmark_runner import BenchmarkRunner - runner = BenchmarkRunner( - benchmark_cls=PolyaxonBenchmark, resources=resources) - runner.run() + #runner = BenchmarkRunner( + # benchmark_cls=PolyaxonBenchmark, resources=resources) + #runner.run() + bench= PolyaxonBenchmark(resources=resources) + #bench.deploy() + bench.setup() + # bench.run() + # # bench.collect_run_results() + + # bench.undeploy() # print(f'polyaxon run -f ./ --project --eager') # print(f'polyaxon run -f ./grid --project --eager'.split()) diff --git a/experiments/polyaxon_minikube/task_light/Dockerfile b/experiments/polyaxon_minikube/task_light/Dockerfile new file mode 100644 index 0000000..f3cf347 --- /dev/null +++ b/experiments/polyaxon_minikube/task_light/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.9-slim + +RUN pip install pip --upgrade +RUN pip install polyaxon + + +COPY experiments experiments +# COPY data data +COPY setup.py setup.py +COPY ml_benchmark ml_benchmark + + + +RUN pip install -e . + +CMD ["python", "experiments/katib_minikube/task_light/task.py"] \ No newline at end of file diff --git a/experiments/polyaxon_minikube/task_light/README.md b/experiments/polyaxon_minikube/task_light/README.md new file mode 100644 index 0000000..3326bc7 --- /dev/null +++ b/experiments/polyaxon_minikube/task_light/README.md @@ -0,0 +1,11 @@ +# PyTorch MNIST Image Classification Example + +This is PyTorch MNIST image classification training container with saving metrics +to the file or printing to the StdOut. It uses convolutional neural network to +train the model. + +Katib uses this training container in some Experiments, for instance in the +[file Metrics Collector example](../../metrics-collector/file-metrics-collector.yaml#L55-L64), +the [file Metrics Collector with logs in JSON format example](../../metrics-collector/file-metrics-collector-with-json-format.yaml#L52-L62), +the [median stopping early stopping rule with logs in JSON format example](../../early-stopping/median-stop-with-json-format.yaml#L62-L71) +and the [PyTorchJob example](../../kubeflow-training-operator/pytorchjob-mnist.yaml#L47-L54). diff --git a/experiments/polyaxon_minikube/task_light/requirements.txt b/experiments/polyaxon_minikube/task_light/requirements.txt new file mode 100644 index 0000000..9852601 --- /dev/null +++ b/experiments/polyaxon_minikube/task_light/requirements.txt @@ -0,0 +1,2 @@ +Pillow>=9.1.1 +numpy \ No newline at end of file diff --git a/experiments/polyaxon_minikube/task_light/task.py b/experiments/polyaxon_minikube/task_light/task.py new file mode 100644 index 0000000..0f60460 --- /dev/null +++ b/experiments/polyaxon_minikube/task_light/task.py @@ -0,0 +1,69 @@ +from __future__ import print_function + +from polyaxon import tracking +import argparse +import logging +import os +import numpy as np +import time +from ml_benchmark.latency_tracker import latency_decorator + +@latency_decorator +def train(times ,epochs): + loss = 1 + for epoch in range(epochs): + for x in np.arange(1,times + 1): + loss = 1 - (x -1 ) / times + time.sleep(0.01) + msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format( + epoch, x , times, + 100. * x / times, loss) + logging.info(msg) + + +def test(): + + test_accuracy =0.8 + logging.info("Validation-accuracy={:.4f}\n".format( + test_accuracy)) + + +def main(): + + #parsing arguments + parser = argparse.ArgumentParser(description="MNIST Example") + parser.add_argument("--batch-size", type=int, default=64, metavar="N", + help="input batch size for training (default: 64)") + parser.add_argument("--epochs", type=int, default=10, metavar="N", + help="number of epochs to train (default: 10)") + parser.add_argument("--lr", type=float, default=0.01, metavar="LR", + help="learning rate (default: 0.01)") + args = parser.parse_args() + + #timestamps format + logging.basicConfig( + format="%(asctime)s %(levelname)-8s %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=logging.DEBUG) + + + #model taining and testing + epochs = args.epochs + batch_size = args.batch_size + lr = args.lr + + train(batch_size,epochs) + test() + avg = {"recall":0.5,"f1-score":0.98,"precision":0.7} + # Polyaxon + #initiating polyaxon tracking + tracking.init() + #loging metrics + tracking.log_metrics(recall=avg["recall"], ) + #logging the results + tracking.log_outputs(f1score=avg["f1-score"],precision=avg["precision"]) + + + +if __name__ == "__main__": + main() diff --git a/ml_benchmark/utils/image_build_wrapper.py b/ml_benchmark/utils/image_build_wrapper.py index e88b366..9c6990c 100644 --- a/ml_benchmark/utils/image_build_wrapper.py +++ b/ml_benchmark/utils/image_build_wrapper.py @@ -35,7 +35,7 @@ def deploy_image(self, image, tag, build_context): if call.returncode != 0: print(call.stderr.decode("utf-8").strip("\n")) raise Exception("Failed to deploy image") - print("IMAGE IMAGE ", call.stdout, call.stderr) +# print("IMAGE IMAGE ", call.stdout, call.stderr) return call.stdout.decode("utf-8").strip("\n") From 86fb3e882c3c47e3199b53bf21ac45f70c25fc4a Mon Sep 17 00:00:00 2001 From: witja46 Date: Tue, 30 Aug 2022 00:00:17 +0200 Subject: [PATCH 9/9] Small clean up. Chaneged to monitoring runing trials with kubernetes api instead of the polyaxon api --- experiments/polyaxon_minikube/grid.yaml | 8 +- .../polyaxon_minikube/polyaxon_benchmark.py | 167 +++++++++--------- 2 files changed, 83 insertions(+), 92 deletions(-) diff --git a/experiments/polyaxon_minikube/grid.yaml b/experiments/polyaxon_minikube/grid.yaml index 0e4071e..a54550f 100644 --- a/experiments/polyaxon_minikube/grid.yaml +++ b/experiments/polyaxon_minikube/grid.yaml @@ -1,14 +1,14 @@ version: 1.1 kind: operation -name: polyaxon-study-6 +name: polyaxon-study-57 matrix: kind: grid - numRuns: 5 + numRuns: 15 concurrency: 5 params: lr: kind: logspace - value: 0.01:0.1:5 + value: 0.01:0.1:15 component: inputs: - name: lr @@ -22,7 +22,7 @@ component: args: ["--batch-size=64", "--lr={{ lr }}"] env: - name: STUDY_NAME - value: "polyaxon-study-6" + value: "polyaxon-study-57" - name: DB_CONN value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb" - name: "METRICS_STORAGE_HOST" diff --git a/experiments/polyaxon_minikube/polyaxon_benchmark.py b/experiments/polyaxon_minikube/polyaxon_benchmark.py index 0ddb940..8591ad9 100644 --- a/experiments/polyaxon_minikube/polyaxon_benchmark.py +++ b/experiments/polyaxon_minikube/polyaxon_benchmark.py @@ -4,10 +4,12 @@ from base64 import decode from cmath import pi from concurrent.futures import process +from importlib.abc import ResourceReader from itertools import count import json from os import path import os +import re from socket import timeout import sys from time import sleep @@ -50,56 +52,18 @@ def __init__(self, resources) -> None: config.load_kube_config() - self.logging_level= self.resources.get("loggingLevel",log.CRITICAL) - + self.clean_up = self.resources.get("cleanUp",True) self.create_clean_image = self.resources.get("createCleanImage",True) - log.basicConfig(format='%(asctime)s Polyaxon Benchmark %(levelname)s: %(message)s',level=self.logging_level) - - self.metrics_ip = resources.get("metricsIP") - - if "dockerImageTag" in self.resources: - self.trial_tag = self.resources["dockerImageTag"] - else: - self.trial_tag = "mnist_task" - - if "dockerUserLogin" in self.resources: - self.docker_user = self.resources["dockerUserLogin"] - else: - self.docker_user = "witja46" - - + self.trial_tag = resources.get("dockerImageTag", "mnist_task") + self.study_name = resources.get("studyName",f'polyaxon-study-{random.randint(0, 100)}') + self.workerCpu=resources.get("workerCpu",2) + self.workerMemory=resources.get("workerMemory",2) + self.workerCount=resources.get("workerCount",5) + self.jobsCount=resources.get("jobsCount",6) - if "dockerUserPassword" in self.resources: - self.docker_pasword = self.resources["dockerUserPassword"] - else: - self.docker_pasword = "" - - - if "studyName" in self.resources: - self.study_name = self.resources["studyName"] - else: - self.study_name = f"polyaxon-study-{random.randint(0, 100)}" - - if "workerCpu" in self.resources: - self.workerCpu = self.resources["workerCpu"] - else: - self.workerCpu = 2 - - if "workerMemory" in resources: - self.workerMemory = self.resources["workerMemory"] - else: - self.workerMemory = 2 - - if "workerCount" in resources: - self.workerCount = self.resources["workerCount"] - else: - self.workerCount = 5 - - if "jobsCount" in resources: - self.jobsCount = self.resources["jobsCount"] - else: - self.jobsCount = 6 + self.logging_level= self.resources.get("loggingLevel",log.CRITICAL) + log.basicConfig(format='%(asctime)s Polyaxon Benchmark %(levelname)s: %(message)s',level=self.logging_level) def deploy(self): """ @@ -121,32 +85,30 @@ def deploy(self): - config.load_kube_config() - w = watch.Watch() - c = client.CoreV1Api() - deployed = 0 log.info("Waiting for all polyaxon pods to be ready:") + config.load_kube_config() + w = watch.Watch() + c = client.CoreV1Api() + # From all pods that polyaxon starts we are onlly really intrested for following 4 that are crucial for runnig of the experiments - monitored_pods = ["polyaxon-polyaxon-streams","polyaxon-polyaxon-operator","polyaxon-polyaxon-gateway","polyaxon-polyaxon-api"] - # TODO changing to list_namespaced_deployments? + unready_pods = ["polyaxon-polyaxon-streams","polyaxon-polyaxon-operator","polyaxon-polyaxon-gateway","polyaxon-polyaxon-api"] for e in w.stream(c.list_namespaced_pod, namespace="polyaxon"): ob = e["object"] - for name in monitored_pods: + for name in unready_pods: #checking if it is one of the pods that we want to monitor if name in ob.metadata.name: - # Checking if the pod already is runnig and its underlying containers are ready + # Checking if the pod already is runnig and its underlying containers are ready, if yes we do not need to monitor it anymore if ob.status.phase == "Running" and ob.status.container_statuses[0].ready: log.info(f'{ob.metadata.name} is ready') - monitored_pods.remove(name) - deployed = deployed + 1 + unready_pods.remove(name) - #if all monitored pods are running the deployment process was ended - if(deployed == 4 ): + #if all monitored pods are ready the deployment process was ended + if not unready_pods: w.stop() log.info("Finished deploying crucial pods") @@ -195,16 +157,18 @@ def setup(self): if self.create_clean_image: log.info("Creating task docker image") - #creating docker image inside of the minikube self.image_builder = builder_from_string("minikube")() PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../")) - log.info(PROJECT_ROOT) + + #creating docker image inside of the minikube res = self.image_builder.deploy_image( f'experiments/polyaxon_minikube/{self.trial_tag}/Dockerfile', self.trial_tag,PROJECT_ROOT) - print(res) + + log.info(res) + #if something went wrong by creation of the image benchmark schould be stoped if f'Successfully tagged {self.trial_tag}' not in res: raise Exception("Image was not created:",res) - print(f"Image: {self.trial_tag}") + log.info(f"Image: {self.trial_tag}") @@ -213,16 +177,16 @@ def setup(self): def run(self): - #TODO add error handling. - #TODO change to invocking procject comand instead of sending the http request to the polaxon api? - log.info("Creating new project:") - # project = requests.post(f'{self.polyaxon_addr}/api/v1/default/projects/create', json={"name": self.study_name, }) + # project = requests.post(f'{self.polyaxon_addr}/api/v1/default/projects/create', json={"name": self.study_name, }) # Alternative way of creating the project crd with http request to the polyaxon api + log.info("Creating new project:") options = f'--name {self.study_name} --description '.split() # adding the project description as the last argument options.append(f'{self.project_description}') + #invoking polyaxon project create comand + #TODO add error handling. res = self.cli_runner.invoke(create,options) log.info(res.output) @@ -231,7 +195,8 @@ def run(self): log.info("Starting polyaxon experiment:") #invoking polyaxon run comand with following options - options = f'-f ./{self.experiment_file_name} --project {self.study_name} --eager'.split() + options = f'-f {self.experiment_file_name} --project {self.study_name} --eager'.split() + #TODO add error handling. res = self.cli_runner.invoke(run,options) log.info(res.output) @@ -240,13 +205,29 @@ def run(self): #TODO switch to kubernetes api for monitoring runing trials log.info("Waiting for the run to finish:") finished = False - while not finished: - runs = self.get_succeeded_runs() - log.info(f'{runs["count"]} jobs out of {self.jobsCount} succeded') + + w = watch.Watch() + c = client.BatchV1Api() + done = 0 + for e in w.stream(c.list_namespaced_job, namespace=self.namespace): + if "object" in e and e["object"].status.completion_time is not None: + done = done + 1 + log.info(f'{done} jobs out of {self.jobsCount} succeded') + if(done == self.jobsCount): + log.info("Finished all runs") + w.stop() + + + + + + # while not finished: + # runs = self.get_succeeded_runs() + # log.info(f'{runs["count"]} jobs out of {self.jobsCount} succeded') - #checking if all runs were finished - finished = runs["count"] == self.jobsCount - sleep(1) + # #checking if all runs were finished + # finished = runs["count"] == self.jobsCount + # sleep(1) return @@ -315,6 +296,7 @@ def undeploy(self): w = watch.Watch() c = client.CoreV1Api() deployed = 0 + to_undeploy= ["polyaxon-polyaxon-streams","polyaxon-polyaxon-operator","polyaxon-polyaxon-gateway","polyaxon-polyaxon-api"] log.info("Waiting for polyaxon pods to be terminated:") for e in w.stream(c.list_namespaced_pod, namespace=self.namespace): ob = e["object"] @@ -322,14 +304,17 @@ def undeploy(self): log.debug(f'{deployed} pods out of 4 were killed') log.debug("\n new in stream:\n") log.debug(ob.metadata.name,ob.status.phase) + for name in to_undeploy: + if name in ob.metadata.name: - if not ob.status.container_statuses[0].ready: - log.info(f'Containers of {ob.metadata.name} are terminated') - deployed = deployed + 1 - if(deployed == 4 ): - w.stop() - # log.info("Finished ") - break + if not ob.status.container_statuses[0].ready: + log.info(f'Containers of {ob.metadata.name} are terminated') + to_undeploy.remove(name) + + if not to_undeploy: + w.stop() + # log.info("Finished ") + break log.info("Killed all pods deleteing the namespace:") @@ -350,6 +335,10 @@ def undeploy(self): except ApiException as err: log.info(err) log.info("Namespace sucessfully deleted") + if self.clean_up: + log.info("Deleteing task docker image from minikube") + sleep(2) + self.image_builder.cleanup(self.trial_tag) w.stop() break @@ -379,21 +368,22 @@ def undeploy(self): resources={ # "studyName":"", "dockerImageTag":"task_light", - "jobsCount":5, - + "jobsCount":15, + "cleanUp":False, +"createCleanImage":False, "workerCount":5, "loggingLevel":log.INFO, "metricsIP": urlopen("https://checkip.amazonaws.com").read().decode("utf-8").strip(), "createCleanImage":True } from ml_benchmark.benchmark_runner import BenchmarkRunner - #runner = BenchmarkRunner( - # benchmark_cls=PolyaxonBenchmark, resources=resources) - #runner.run() + runner = BenchmarkRunner( + benchmark_cls=PolyaxonBenchmark, resources=resources) + runner.run() - bench= PolyaxonBenchmark(resources=resources) + #bench= PolyaxonBenchmark(resources=resources) #bench.deploy() - bench.setup() + #bench.setup() # bench.run() # # bench.collect_run_results() @@ -418,3 +408,4 @@ def undeploy(self): # print(res) +