# Summary of this patch:
#   * ingest.sql: the build_id capture is widened from a fixed 36-character
#     hex/hyphen pattern to any run of non-'/' characters after "/buildid-".
#   * macrobenchmarks-cloudbuild.yaml: _IMAGE, _RANKS_PER_NODE and _MACHINE_TYPE
#     are added to the list of "_NAME=${_NAME}" entries.
#   * scripts/lib.sh: new helper shared_workload_helm_args, which fills the
#     global SHARED_HELM_ARGS array with the helm flags common to both installs.
#   * scripts/run_workload.sh: calls the helper and expands the array in place
#     of the duplicated --set flags; the set of flags passed is unchanged.
#   * scripts/seed_checkpoint.sh: same refactor; through the shared array the
#     seed install now also receives workload.image, workload.ranksPerNode and
#     the gke-nodepool nodeSelector, which it did not pass before.
#
# NOTE(review): the leading indentation of context lines below was reconstructed;
# apply with whitespace-insensitive matching (git apply --ignore-whitespace, or
# patch -l) or regenerate the patch from the repository.
# NOTE(review): the final hunk declares one more trailing context line than is
# present here - confirm the tail of the patch.
diff --git a/cloudbuild/macrobenchmarks/ingest.sql b/cloudbuild/macrobenchmarks/ingest.sql
index 7c5ac0f9..8db0cfc7 100644
--- a/cloudbuild/macrobenchmarks/ingest.sql
+++ b/cloudbuild/macrobenchmarks/ingest.sql
@@ -46,7 +46,7 @@ BEGIN
     "INSERT INTO `@PROJECT_ID@.@DATASET_NAME@.history` (run_date, build_id, run_timestamp, source_uri, branch_name, ",
     columns_list,
     ") SELECT PARSE_DATE('%Y%m%d', REGEXP_EXTRACT(_FILE_NAME, r'/(\\d{8})/')) as run_date, ",
-    "REGEXP_EXTRACT(_FILE_NAME, r'/buildid-([0-9a-fA-F-]{36})/') as build_id, ",
+    "REGEXP_EXTRACT(_FILE_NAME, r'/buildid-([^/]+)/') as build_id, ",
     "PARSE_TIMESTAMP('%Y%m%d-%H%M%S', REGEXP_EXTRACT(_FILE_NAME, r'/(\\d{8}-\\d{6})\\.csv')) as run_timestamp, ",
     "_FILE_NAME as source_uri, ",
     "REGEXP_EXTRACT(_FILE_NAME, r'/branch=([^/]+)/') as branch_name, ",
diff --git a/cloudbuild/macrobenchmarks/macrobenchmarks-cloudbuild.yaml b/cloudbuild/macrobenchmarks/macrobenchmarks-cloudbuild.yaml
index 809e5e13..4cba2411 100644
--- a/cloudbuild/macrobenchmarks/macrobenchmarks-cloudbuild.yaml
+++ b/cloudbuild/macrobenchmarks/macrobenchmarks-cloudbuild.yaml
@@ -120,9 +120,12 @@ steps:
       - "_WORKLOAD=${_WORKLOAD}"
       - "_DATASET_PATH=${_DATASET_PATH}"
       - "_MODEL_ID=${_MODEL_ID}"
+      - "_IMAGE=${_IMAGE}"
       - "_TRAINING_STRATEGY=${_TRAINING_STRATEGY}"
       - "_HF_TOKEN=${_HF_TOKEN}"
       - "_NODES=${_NODES}"
+      - "_RANKS_PER_NODE=${_RANKS_PER_NODE}"
+      - "_MACHINE_TYPE=${_MACHINE_TYPE}"
       - "_REQUIREMENTS=${_REQUIREMENTS}"
       - "_SEED_CHECKPOINT=${_SEED_CHECKPOINT}"
       - "_CHECKPOINT_LOAD_PATH=${_CHECKPOINT_LOAD_PATH}"
diff --git a/cloudbuild/macrobenchmarks/scripts/lib.sh b/cloudbuild/macrobenchmarks/scripts/lib.sh
index 86d1faad..d2138b08 100755
--- a/cloudbuild/macrobenchmarks/scripts/lib.sh
+++ b/cloudbuild/macrobenchmarks/scripts/lib.sh
@@ -30,6 +30,21 @@ skip_if_failed() {
   fi
 }
 
+shared_workload_helm_args() {
+  SHARED_HELM_ARGS=(
+    --set gcsfs.datasetPath="${_DATASET_PATH}"
+    --set workload.modelId="${_MODEL_ID}"
+    --set-string workload.image="${_IMAGE}"
+    --set workload.hfToken="${_HF_TOKEN}"
+    --set workload.nodes="${_NODES}"
+    --set workload.ranksPerNode="${_RANKS_PER_NODE}"
+    --set workload.requirements="${_REQUIREMENTS}"
+    --set workload.trainingStrategy="${_TRAINING_STRATEGY}"
+    --set "nodeSelector.cloud\.google\.com/gke-nodepool=${_MACHINE_TYPE}"
+    --set serviceAccount=default
+  )
+}
+
 # Poll a JobSet until it reports Completed (return 0) or Failed/timeout (record
 # the failure in the ledger, dump diagnostics, return 1). Shared by the
 # seed-checkpoint and run-workload steps so the 240x30s poll lives in one place.
diff --git a/cloudbuild/macrobenchmarks/scripts/run_workload.sh b/cloudbuild/macrobenchmarks/scripts/run_workload.sh
index 54a9a3b4..cadbcee1 100755
--- a/cloudbuild/macrobenchmarks/scripts/run_workload.sh
+++ b/cloudbuild/macrobenchmarks/scripts/run_workload.sh
@@ -18,26 +18,18 @@ if [ -z "$EFFECTIVE_LOAD_PATH" ] && [ "${_SEED_CHECKPOINT}" = "true" ]; then
   EFFECTIVE_LOAD_PATH="${SEEDED_CKPT_PATH:-}"
 fi
 echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > /workspace/start_time.txt
+shared_workload_helm_args
 helm install "$RUN_ID" "$CHART" -f "$CHART/values_base.yaml" \
-  --set gcsfs.datasetPath="${_DATASET_PATH}" \
+  "${SHARED_HELM_ARGS[@]}" \
   --set gcsfs.ckptWritePath="gs://$CHECKPOINT_BUCKET/checkpoints" \
   --set-string gcsfs.ckptLoadPath="${EFFECTIVE_LOAD_PATH}" \
-  --set workload.modelId="${_MODEL_ID}" \
-  --set-string workload.image="${_IMAGE}" \
-  --set workload.hfToken="${_HF_TOKEN}" \
   --set workload.steps="${_STEPS}" \
   --set workload.ckptWriterInterval="${_CHECKPOINT_INTERVAL}" \
   --set workload.ckptToKeep="${_CKPT_TO_KEEP}" \
-  --set workload.nodes="${_NODES}" \
-  --set workload.ranksPerNode="${_RANKS_PER_NODE}" \
   --set workload.perDeviceBatch="${_PER_DEVICE_BATCH}" \
   --set workload.gradAccum="${_GRAD_ACCUM}" \
   --set workload.dataloaderWorkers="${_DATALOADER_WORKERS}" \
-  --set workload.requirements="${_REQUIREMENTS}" \
-  --set workload.trainingStrategy="${_TRAINING_STRATEGY}" \
-  --set workload.simulatedStepComputeSeconds="${_SIMULATED_STEP_COMPUTE_SECONDS}" \
-  --set "nodeSelector.cloud\.google\.com/gke-nodepool=${_MACHINE_TYPE}" \
-  --set serviceAccount=default
+  --set workload.simulatedStepComputeSeconds="${_SIMULATED_STEP_COMPUTE_SECONDS}"
 if ! wait_for_jobset "$RUN_ID" run-workload; then
   exit 1
 fi
diff --git a/cloudbuild/macrobenchmarks/scripts/seed_checkpoint.sh b/cloudbuild/macrobenchmarks/scripts/seed_checkpoint.sh
index a3d891cf..06798904 100755
--- a/cloudbuild/macrobenchmarks/scripts/seed_checkpoint.sh
+++ b/cloudbuild/macrobenchmarks/scripts/seed_checkpoint.sh
@@ -29,19 +29,14 @@ CHART="gcsfs/tests/perf/macrobenchmarks/workloads/${_WORKLOAD}/helm_chart"
 # eagerly-materialized AdamW state are serialized the same as in a long run.
 # simulatedStepComputeSeconds=0 makes the single step instant.
 echo "Installing seed release $SEED_RUN_ID to write one checkpoint to $SEED_CKPT_DIR ..."
+shared_workload_helm_args
 helm install "$SEED_RUN_ID" "$CHART" -f "$CHART/values_base.yaml" \
-  --set gcsfs.datasetPath="${_DATASET_PATH}" \
+  "${SHARED_HELM_ARGS[@]}" \
   --set gcsfs.ckptWritePath="$SEED_CKPT_DIR" \
   --set-string gcsfs.ckptLoadPath="" \
-  --set workload.modelId="${_MODEL_ID}" \
-  --set workload.hfToken="${_HF_TOKEN}" \
   --set workload.steps="1" \
   --set workload.ckptWriterInterval="1" \
-  --set workload.nodes="${_NODES}" \
-  --set workload.requirements="${_REQUIREMENTS}" \
-  --set workload.trainingStrategy="${_TRAINING_STRATEGY}" \
-  --set workload.simulatedStepComputeSeconds="0" \
-  --set serviceAccount=default
+  --set workload.simulatedStepComputeSeconds="0"
 if ! wait_for_jobset "$SEED_RUN_ID" seed-checkpoint; then
   helm uninstall "$SEED_RUN_ID" || true