Skip to content

Commit 37dcbcc

Browse files
committed
fix(tutorial): Fixed issues with multi-machine training environment variable settings
1 parent 1c356d7 commit 37dcbcc

1 file changed

Lines changed: 5 additions & 6 deletions

File tree

tutorial/example_deep_finance/deep_finance.sh

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,16 @@ NUM_REPEAT=4 # group size: each query is rolled out NUM_REPEAT times
2222
TRAIN_BATCH_SIZE=32 # training batch size
2323
NUM_STEPS=6 # number of step rounds per sample
2424
DEEPFINANCE_TOOL_RESULT_MAX_CHARS=10000
25+
2526
# Main directory
2627
export AJET_ROOT="/mnt/data_cpfs/taoshuchang.tsc/deepresearch/AgentJet"
28+
29+
NNODES=${WORLD_SIZE} # NOTE(review): WORLD_SIZE is used with no default or emptiness check visible above it in this hunk - confirm the launcher always sets it
30+
2731
# Sensitive configuration (API_KEY plus model and data locations) is read from .env
2832
cd ${AJET_ROOT}
2933
source .venv/bin/activate
34+
3035
# API key configuration - loaded from the .env file
3136
ENV_FILE="${AJET_ROOT}/.env"
3237
if [ -f "$ENV_FILE" ]; then
@@ -112,12 +117,6 @@ ENV_SERVICE_LOG="${LOG_DIR}/env_service_${SUFFIX}_${CURRENT_TIME}.log"
112117
TRAIN_LOG="${LOG_DIR}/train_${SUFFIX}_${CURRENT_TIME}.log"
113118

114119
# Multi-node training parameter configuration
115-
if [ -z "${WORLD_SIZE}" ]; then
116-
echo "ERROR: WORLD_SIZE environment variable is not set!"
117-
echo "Please ensure this script is run in a multi-node environment (e.g., PAI-DLC, SLURM)"
118-
exit 1
119-
fi
120-
NNODES=${WORLD_SIZE}
121120
GPUS_PER_NODE=8
122121
EXPECTED_WORKERS=$WORLD_SIZE
123122

0 commit comments

Comments
 (0)