Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,5 +131,6 @@ dmypy.json
exp__*
experiments/simple_raytune/benchmark__RaytuneBenchmark
experiments/optuna_minikube/benchmark__OptunaMinikubeBenchmark
experiments/polyaxon_minikube/benchmark__PolyaxonBenchmark

data/
29 changes: 29 additions & 0 deletions experiments/polyaxon_minikube/experiment_template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Polyaxon operation template for a grid-search study.
# The $-placeholders ($study_name, $jobs_num, $worker_num, $worker_image,
# $metrics_ip) are substituted by the benchmark harness before submission
# (see grid.yaml for a rendered example).
version: 1.1
kind: operation
name: $study_name
matrix:
  kind: grid
  numRuns: $jobs_num        # total number of trials in the study
  concurrency: $worker_num  # trials executed in parallel
  params:
    lr:
      kind: logspace
      # start:stop:count — learning-rate grid sampled on a log scale
      value: 0.01:0.1:$jobs_num
component:
  inputs:
  - name: lr
    type: float
  run:
    kind: job
    container:
      imagePullPolicy: IfNotPresent
      image: $worker_image
      command: [python3, experiments/polyaxon_minikube/$worker_image/task.py]
      args: ["--batch-size=64", "--lr={{ lr }}"]
      env:
      - name: STUDY_NAME
        value: "$study_name"
      # NOTE(review): database credentials are hard-coded — consider
      # injecting them via a Kubernetes secret instead.
      - name: DB_CONN
        value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
      - name: "METRICS_STORAGE_HOST"
        value: "$metrics_ip"
29 changes: 29 additions & 0 deletions experiments/polyaxon_minikube/grid.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Concrete Polyaxon grid-search operation — appears to be a rendered
# instance of experiment_template.yaml with study-specific values filled in.
version: 1.1
kind: operation
name: polyaxon-study-57
matrix:
  kind: grid
  numRuns: 15     # total number of trials
  concurrency: 5  # trials executed in parallel
  params:
    lr:
      kind: logspace
      # start:stop:count — learning-rate grid sampled on a log scale
      value: 0.01:0.1:15
component:
  inputs:
  - name: lr
    type: float
  run:
    kind: job
    container:
      imagePullPolicy: IfNotPresent
      image: task_light
      command: [python3, experiments/polyaxon_minikube/task_light/task.py]
      args: ["--batch-size=64", "--lr={{ lr }}"]
      env:
      - name: STUDY_NAME
        value: "polyaxon-study-57"
      # NOTE(review): database credentials and the metrics host IP are
      # hard-coded — consider injecting them via secrets/config instead.
      - name: DB_CONN
        value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
      - name: "METRICS_STORAGE_HOST"
        value: "134.101.4.161"
17 changes: 17 additions & 0 deletions experiments/polyaxon_minikube/mnist_task/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Worker image for the Polyaxon MNIST benchmark task.
# TODO: change to a slimmer base image (the CUDA runtime image is large)
FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime
#FROM python:3.9-slim

RUN pip install pip --upgrade
RUN pip install polyaxon

# Copy the benchmark sources needed by the task into the image.
COPY experiments experiments
#COPY data data
COPY setup.py setup.py
COPY ml_benchmark ml_benchmark



# Install the benchmark package (and its dependencies) in editable mode.
RUN pip install -e .

CMD ["python", "experiments/polyaxon_minikube/mnist_task/task.py"]
11 changes: 11 additions & 0 deletions experiments/polyaxon_minikube/mnist_task/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# PyTorch MNIST Image Classification Example

This is a PyTorch MNIST image classification training container that saves
metrics to a file or prints them to stdout. It uses a convolutional neural
network to train the model.

Katib uses this training container in some Experiments, for instance in the
[file Metrics Collector example](../../metrics-collector/file-metrics-collector.yaml#L55-L64),
the [file Metrics Collector with logs in JSON format example](../../metrics-collector/file-metrics-collector-with-json-format.yaml#L52-L62),
the [median stopping early stopping rule with logs in JSON format example](../../early-stopping/median-stop-with-json-format.yaml#L62-L71)
and the [PyTorchJob example](../../kubeflow-training-operator/pytorchjob-mnist.yaml#L47-L54).
3 changes: 3 additions & 0 deletions experiments/polyaxon_minikube/mnist_task/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Pillow>=9.1.1
numpy
polyaxon
85 changes: 85 additions & 0 deletions experiments/polyaxon_minikube/mnist_task/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from polyaxon import tracking
import argparse
import logging
import numpy as np
import time
import os
import sys
# Make the repository root importable so `ml_benchmark` resolves when this
# script is executed directly inside the container.
# NOTE(review): five "../" from __file__ resolves one level ABOVE the
# directory four levels up (task.py -> mnist_task -> polyaxon_minikube ->
# experiments -> root) — confirm this lands on the intended project root
# in the container layout.
PROJECT_ROOT = os.path.abspath(os.path.join(__file__ ,"../../../../../"))
sys.path.append(PROJECT_ROOT)
from ml_benchmark.workload.mnist.mnist_task import MnistTask


def train(times, epoch):
    """Simulate one training epoch by logging a linearly decreasing loss.

    Args:
        times: number of simulated batches in the epoch.
        epoch: epoch index, used only in the log messages.
    """
    for x in np.arange(1, times + 1):
        # Loss decays linearly from 1.0 towards 1/times over the epoch.
        loss = 1 - (x - 1) / times
        time.sleep(0.1)  # simulate per-batch work
        # Lazy %-style args: the message is only formatted if emitted.
        logging.info(
            "Train Epoch: %s [%s/%s (%.0f%%)]\tloss=%.4f",
            epoch, x, times, 100. * x / times, loss,
        )


def test():
    """Simulate a validation pass by logging a fixed accuracy value."""
    accuracy = 0.8
    message = "Validation-accuracy={:.4f}\n".format(accuracy)
    logging.info(message)



def main():
    """Run one MNIST benchmark trial and report its metrics to Polyaxon.

    Parses the hyperparameters from the command line, executes the
    MnistTask objective, prints the weighted-average validation scores
    and pushes them to Polyaxon tracking.
    """
    # Parse the hyperparameters passed by the Polyaxon operation spec.
    parser = argparse.ArgumentParser(description="MNIST Example")
    parser.add_argument("--batch-size", type=int, default=64, metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--epochs", type=int, default=10, metavar="N",
                        help="number of epochs to train (default: 10)")
    parser.add_argument("--lr", type=float, default=0.01, metavar="LR",
                        help="learning rate (default: 0.01)")
    args = parser.parse_args()
    # NOTE(review): --batch-size is accepted for CLI compatibility with the
    # experiment template but is currently not forwarded to MnistTask.

    # Run one trial of the benchmark workload.
    task = MnistTask(config_init={"epochs": args.epochs})
    objective = task.create_objective()
    # TODO: add the weight decay to the definition of the template
    objective.set_hyperparameters({"learning_rate": args.lr,
                                   "weight_decay": 0.01})
    objective.train()
    validation_scores = objective.validate()

    # Report the weighted-average scores of the validation run.
    avg = validation_scores["weighted avg"]
    print("precision", avg["precision"])
    print("f1-score", avg["f1-score"])

    # Polyaxon: initialise run tracking, then push metrics and outputs.
    tracking.init()
    tracking.log_metrics(recall=avg["recall"])
    tracking.log_outputs(f1score=avg["f1-score"], precision=avg["precision"])


if __name__ == "__main__":
    main()
Loading