Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,5 +131,6 @@ dmypy.json
exp__*
experiments/simple_raytune/benchmark__RaytuneBenchmark
experiments/optuna_minikube/benchmark__OptunaMinikubeBenchmark
experiments/katib_minikube/benchmark__KatibBenchmark

data/
52 changes: 52 additions & 0 deletions experiments/katib_minikube/experiment_template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
---
# Katib Experiment template for the katib_minikube benchmark.
#
# Dollar-prefixed names are placeholders that are filled in before the
# manifest is applied: study_name, worker_num, jobs_num, worker_image and
# metrics_ip.
# NOTE(review): the placeholder syntax looks like Python string.Template;
# confirm against the code that renders this file.
# Keep dollar signs out of comments so the renderer never tries to
# substitute inside them.
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
  namespace: kubeflow
  # Quoted so a numeric-looking or empty study name still parses as a string.
  name: "$study_name"
spec:
  objective:
    type: maximize
    goal: 0.8
    objectiveMetricName: precision
  algorithm:
    algorithmName: grid
  # Trial counts stay unquoted: they must render as integers.
  parallelTrialCount: $worker_num
  maxTrialCount: $jobs_num
  maxFailedTrialCount: $worker_num
  parameters:
    - name: lr
      parameterType: double
      feasibleSpace:
        # Katib expects these bounds as strings, hence the quotes.
        min: "0.001"
        max: "0.02"
        step: "0.001"
  trialTemplate:
    primaryContainerName: training-container
    trialParameters:
      # Exposes the search parameter "lr" to the trial spec as "learningRate".
      - name: learningRate
        description: Learning rate for the training model
        reference: lr
    trialSpec:
      apiVersion: batch/v1
      kind: Job
      spec:
        template:
          spec:
            containers:
              - name: training-container
                image: "$worker_image"
                imagePullPolicy: IfNotPresent
                command:
                  - "python3"
                  # NOTE(review): the task path reuses the image name as a
                  # directory name; confirm the two are meant to be coupled.
                  - "experiments/katib_minikube/$worker_image/task.py"
                  - "--batch-size=64"
                  # The doubled dollar sign is an escape that renders as a
                  # single one, leaving the Katib reference to the
                  # learningRate trial parameter intact for Katib to fill in
                  # per trial.
                  - "--lr=$${trialParameters.learningRate}"
                env:
                  - name: STUDY_NAME
                    value: "$study_name"
                  # NOTE(review): credentials are hard-coded in the
                  # connection string; consider sourcing them from a
                  # Kubernetes Secret instead.
                  - name: DB_CONN
                    value: "postgresql://postgresadmin:admin123@postgres:5432/postgresdb"
                  - name: "METRICS_STORAGE_HOST"
                    value: "$metrics_ip"
            restartPolicy: Never
52 changes: 52 additions & 0 deletions experiments/katib_minikube/grid.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
---
# Concrete Katib grid-search Experiment for the MNIST task.
# NOTE(review): this looks like a rendered instance of
# experiment_template.yaml (fixed study name, counts and host); confirm
# whether it is meant to be checked in.
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
  namespace: kubeflow
  name: katib-study-32
spec:
  # Maximize the "precision" metric; the goal value is 0.8.
  objective:
    type: maximize
    goal: 0.8
    objectiveMetricName: precision
  algorithm:
    algorithmName: grid
  parallelTrialCount: 2
  maxTrialCount: 2
  maxFailedTrialCount: 2
  parameters:
    # Learning-rate grid; bounds and step are given as strings.
    - name: lr
      parameterType: double
      feasibleSpace:
        min: '0.001'
        max: '0.02'
        step: '0.001'
  trialTemplate:
    primaryContainerName: training-container
    trialParameters:
      # Maps the search parameter "lr" to the name used in the trial spec.
      - name: learningRate
        description: Learning rate for the training model
        reference: lr
    trialSpec:
      apiVersion: batch/v1
      kind: Job
      spec:
        template:
          spec:
            restartPolicy: Never
            containers:
              - name: training-container
                image: mnist_task
                imagePullPolicy: IfNotPresent
                command:
                  - python3
                  - experiments/katib_minikube/mnist_task/task.py
                  - '--batch-size=64'
                  # Reference to the learningRate trial parameter declared
                  # under trialParameters.
                  - '--lr=${trialParameters.learningRate}'
                env:
                  - name: STUDY_NAME
                    value: 'katib-study-32'
                  # NOTE(review): credentials are embedded in this
                  # connection string; consider a Kubernetes Secret.
                  - name: DB_CONN
                    value: 'postgresql://postgresadmin:admin123@postgres:5432/postgresdb'
                  # NOTE(review): hard-coded IP address; confirm it is not
                  # specific to one machine.
                  - name: METRICS_STORAGE_HOST
                    value: '134.101.4.161'
Loading