Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cloudbuild/macrobenchmarks/ingest.sql
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ BEGIN
"INSERT INTO `@PROJECT_ID@.@DATASET_NAME@.history` (run_date, build_id, run_timestamp, source_uri, branch_name, ",
columns_list,
") SELECT PARSE_DATE('%Y%m%d', REGEXP_EXTRACT(_FILE_NAME, r'/(\\d{8})/')) as run_date, ",
"REGEXP_EXTRACT(_FILE_NAME, r'/buildid-([0-9a-fA-F-]{36})/') as build_id, ",
"REGEXP_EXTRACT(_FILE_NAME, r'/buildid-([^/]+)/') as build_id, ",
"PARSE_TIMESTAMP('%Y%m%d-%H%M%S', REGEXP_EXTRACT(_FILE_NAME, r'/(\\d{8}-\\d{6})\\.csv')) as run_timestamp, ",
"_FILE_NAME as source_uri, ",
"REGEXP_EXTRACT(_FILE_NAME, r'/branch=([^/]+)/') as branch_name, ",
Expand Down
3 changes: 3 additions & 0 deletions cloudbuild/macrobenchmarks/macrobenchmarks-cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,12 @@ steps:
- "_WORKLOAD=${_WORKLOAD}"
- "_DATASET_PATH=${_DATASET_PATH}"
- "_MODEL_ID=${_MODEL_ID}"
- "_IMAGE=${_IMAGE}"
- "_TRAINING_STRATEGY=${_TRAINING_STRATEGY}"
- "_HF_TOKEN=${_HF_TOKEN}"
- "_NODES=${_NODES}"
- "_RANKS_PER_NODE=${_RANKS_PER_NODE}"
- "_MACHINE_TYPE=${_MACHINE_TYPE}"
- "_REQUIREMENTS=${_REQUIREMENTS}"
- "_SEED_CHECKPOINT=${_SEED_CHECKPOINT}"
- "_CHECKPOINT_LOAD_PATH=${_CHECKPOINT_LOAD_PATH}"
Expand Down
15 changes: 15 additions & 0 deletions cloudbuild/macrobenchmarks/scripts/lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,21 @@ skip_if_failed() {
fi
}

#######################################
# Populate SHARED_HELM_ARGS with the `helm install` flags that are common to
# every workload release, so callers can splice them in with
# "${SHARED_HELM_ARGS[@]}" and then append their step-specific --set flags.
# Globals:
#   _DATASET_PATH, _MODEL_ID, _IMAGE, _HF_TOKEN, _NODES, _RANKS_PER_NODE,
#   _REQUIREMENTS, _TRAINING_STRATEGY, _MACHINE_TYPE (read)
#   SHARED_HELM_ARGS (written; deliberately NOT local, callers read it)
# Arguments:
#   None
# Outputs:
#   None
#######################################
shared_workload_helm_args() {
  SHARED_HELM_ARGS=(
    --set gcsfs.datasetPath="${_DATASET_PATH}"
    --set workload.modelId="${_MODEL_ID}"
    # --set-string: presumably keeps helm from type-coercing the image
    # reference -- TODO confirm against the chart's values schema.
    --set-string workload.image="${_IMAGE}"
    --set workload.hfToken="${_HF_TOKEN}"
    --set workload.nodes="${_NODES}"
    --set workload.ranksPerNode="${_RANKS_PER_NODE}"
    --set workload.requirements="${_REQUIREMENTS}"
    --set workload.trainingStrategy="${_TRAINING_STRATEGY}"
    # The dots inside the label key are backslash-escaped so that helm's --set
    # parser does not treat them as nested-key separators.
    --set "nodeSelector.cloud\.google\.com/gke-nodepool=${_MACHINE_TYPE}"
    --set serviceAccount=default
  )
}

# Poll a JobSet until it reports Completed (return 0) or Failed/timeout (record
# the failure in the ledger, dump diagnostics, return 1). Shared by the
# seed-checkpoint and run-workload steps so the 240x30s poll lives in one place.
Expand Down
14 changes: 3 additions & 11 deletions cloudbuild/macrobenchmarks/scripts/run_workload.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,18 @@ if [ -z "$EFFECTIVE_LOAD_PATH" ] && [ "${_SEED_CHECKPOINT}" = "true" ]; then
EFFECTIVE_LOAD_PATH="${SEEDED_CKPT_PATH:-}"
fi
echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > /workspace/start_time.txt
shared_workload_helm_args
helm install "$RUN_ID" "$CHART" -f "$CHART/values_base.yaml" \
--set gcsfs.datasetPath="${_DATASET_PATH}" \
"${SHARED_HELM_ARGS[@]}" \
--set gcsfs.ckptWritePath="gs://$CHECKPOINT_BUCKET/checkpoints" \
--set-string gcsfs.ckptLoadPath="${EFFECTIVE_LOAD_PATH}" \
--set workload.modelId="${_MODEL_ID}" \
--set-string workload.image="${_IMAGE}" \
--set workload.hfToken="${_HF_TOKEN}" \
--set workload.steps="${_STEPS}" \
--set workload.ckptWriterInterval="${_CHECKPOINT_INTERVAL}" \
--set workload.ckptToKeep="${_CKPT_TO_KEEP}" \
--set workload.nodes="${_NODES}" \
--set workload.ranksPerNode="${_RANKS_PER_NODE}" \
--set workload.perDeviceBatch="${_PER_DEVICE_BATCH}" \
--set workload.gradAccum="${_GRAD_ACCUM}" \
--set workload.dataloaderWorkers="${_DATALOADER_WORKERS}" \
--set workload.requirements="${_REQUIREMENTS}" \
--set workload.trainingStrategy="${_TRAINING_STRATEGY}" \
--set workload.simulatedStepComputeSeconds="${_SIMULATED_STEP_COMPUTE_SECONDS}" \
--set "nodeSelector.cloud\.google\.com/gke-nodepool=${_MACHINE_TYPE}" \
--set serviceAccount=default
--set workload.simulatedStepComputeSeconds="${_SIMULATED_STEP_COMPUTE_SECONDS}"
if ! wait_for_jobset "$RUN_ID" run-workload; then
exit 1
fi
Expand Down
11 changes: 3 additions & 8 deletions cloudbuild/macrobenchmarks/scripts/seed_checkpoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,19 +29,14 @@ CHART="gcsfs/tests/perf/macrobenchmarks/workloads/${_WORKLOAD}/helm_chart"
# eagerly-materialized AdamW state are serialized the same as in a long run.
# simulatedStepComputeSeconds=0 makes the single step instant.
echo "Installing seed release $SEED_RUN_ID to write one checkpoint to $SEED_CKPT_DIR ..."
shared_workload_helm_args
helm install "$SEED_RUN_ID" "$CHART" -f "$CHART/values_base.yaml" \
--set gcsfs.datasetPath="${_DATASET_PATH}" \
"${SHARED_HELM_ARGS[@]}" \
--set gcsfs.ckptWritePath="$SEED_CKPT_DIR" \
--set-string gcsfs.ckptLoadPath="" \
--set workload.modelId="${_MODEL_ID}" \
--set workload.hfToken="${_HF_TOKEN}" \
--set workload.steps="1" \
--set workload.ckptWriterInterval="1" \
--set workload.nodes="${_NODES}" \
--set workload.requirements="${_REQUIREMENTS}" \
--set workload.trainingStrategy="${_TRAINING_STRATEGY}" \
--set workload.simulatedStepComputeSeconds="0" \
--set serviceAccount=default
--set workload.simulatedStepComputeSeconds="0"

if ! wait_for_jobset "$SEED_RUN_ID" seed-checkpoint; then
helm uninstall "$SEED_RUN_ID" || true
Expand Down
Loading