您好,我使用该仓库中的代码进行了模型训练。首先我尝试了 Wan2.1-I2V-14B-480P 的训练,以下是我的训练脚本和超参数:
# Multi-node FSDP training launch for scripts/wan2.1/train.py in i2v mode.
# Cluster topology and data/model locations are configured through the
# exported variables below; the training hyper-parameters are passed as
# command-line flags to train.py.
export MASTER_ADDR="10.86.17.140"
export MASTER_PORT=29500
export WORLD_SIZE=3    # The number of machines
export NUM_PROCESS=24  # The number of processes, such as WORLD_SIZE * 8
export RANK=0          # The rank of this machine
export MODEL_NAME="/mmu_mllm_hdd_2/zuofei/model_param/Wan2.1-I2V-14B-480P"
# export DATASET_NAME="datasets/internal_datasets/"
export DATASET_META_NAME="/mmu_mllm_hdd_2/zuofei/VideoX-Fun/datasets/i2v_traing.json"

# NCCL_IB_DISABLE=1 and NCCL_P2P_DISABLE=1 are used in multi nodes without RDMA.
export NCCL_IB_DISABLE=1
export NCCL_P2P_DISABLE=1
# Exported like the other NCCL settings: a bare `NCCL_DEBUG=INFO` on its own
# line only sets a shell variable and is not inherited by child processes.
export NCCL_DEBUG=INFO

# Every argument line ends with a backslash so that the whole invocation is a
# single command; without the continuations each `--flag` line would be run
# as its own command and train.py would receive no arguments.
accelerate launch \
  --main_process_ip="$MASTER_ADDR" \
  --main_process_port="$MASTER_PORT" \
  --num_machines="$WORLD_SIZE" \
  --num_processes="$NUM_PROCESS" \
  --machine_rank="$RANK" \
  --mixed_precision="bf16" \
  --use_fsdp \
  --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP \
  --fsdp_transformer_layer_cls_to_wrap=WanAttentionBlock \
  --fsdp_sharding_strategy "FULL_SHARD" \
  --fsdp_state_dict_type=SHARDED_STATE_DICT \
  --fsdp_backward_prefetch "BACKWARD_PRE" \
  --fsdp_cpu_ram_efficient_loading False \
  scripts/wan2.1/train.py \
  --config_path="config/wan2.1/wan_civitai.yaml" \
  --pretrained_model_name_or_path="$MODEL_NAME" \
  --train_data_dir="${DATASET_NAME:-}" \
  --train_data_meta="$DATASET_META_NAME" \
  --image_sample_size=640 \
  --video_sample_size=640 \
  --token_sample_size=640 \
  --video_sample_stride=2 \
  --video_sample_n_frames=121 \
  --train_batch_size=1 \
  --video_repeat=1 \
  --gradient_accumulation_steps=2 \
  --dataloader_num_workers=8 \
  --num_train_epochs=3 \
  --checkpointing_steps=200 \
  --learning_rate=2e-05 \
  --lr_scheduler="constant_with_warmup" \
  --lr_warmup_steps=100 \
  --seed=42 \
  --output_dir="2.1_i2v_output_dir" \
  --gradient_checkpointing \
  --mixed_precision="bf16" \
  --adam_weight_decay=3e-2 \
  --adam_epsilon=1e-10 \
  --vae_mini_batch=1 \
  --max_grad_norm=0.05 \
  --random_hw_adapt \
  --training_with_video_token_length \
  --enable_bucket \
  --uniform_sampling \
  --low_vram \
  --train_mode="i2v" \
  --trainable_modules "."
# NOTE: DATASET_NAME is commented out above, so --train_data_dir is passed an
# empty value unless the calling environment already defines DATASET_NAME.
# NOTE(review): presumably the paths inside the metadata JSON are then used
# as-is — confirm against train.py.
我发现训练后生成的视频有很强的抖动感,画面明显不稳定:
训练前:
https://github.com/user-attachments/assets/a276b808-5c01-446a-a090-d57c8f83b1a6
训练后:
https://github.com/user-attachments/assets/b44b3b34-28f4-4ecf-b1a0-799142efd3d0
想问一下这是什么问题导致的呢?
您好,我使用该仓库中的代码进行了模型训练。首先我尝试了 Wan2.1-I2V-14B-480P 的训练,以下是我的训练脚本和超参数:
# Multi-node FSDP training launch for scripts/wan2.1/train.py in i2v mode.
# Cluster topology and data/model locations are configured through the
# exported variables below; the training hyper-parameters are passed as
# command-line flags to train.py.
export MASTER_ADDR="10.86.17.140"
export MASTER_PORT=29500
export WORLD_SIZE=3    # The number of machines
export NUM_PROCESS=24  # The number of processes, such as WORLD_SIZE * 8
export RANK=0          # The rank of this machine
export MODEL_NAME="/mmu_mllm_hdd_2/zuofei/model_param/Wan2.1-I2V-14B-480P"
# export DATASET_NAME="datasets/internal_datasets/"
export DATASET_META_NAME="/mmu_mllm_hdd_2/zuofei/VideoX-Fun/datasets/i2v_traing.json"

# NCCL_IB_DISABLE=1 and NCCL_P2P_DISABLE=1 are used in multi nodes without RDMA.
export NCCL_IB_DISABLE=1
export NCCL_P2P_DISABLE=1
# Exported like the other NCCL settings: a bare `NCCL_DEBUG=INFO` on its own
# line only sets a shell variable and is not inherited by child processes.
export NCCL_DEBUG=INFO

# Every argument line ends with a backslash so that the whole invocation is a
# single command; without the continuations each `--flag` line would be run
# as its own command and train.py would receive no arguments.
accelerate launch \
  --main_process_ip="$MASTER_ADDR" \
  --main_process_port="$MASTER_PORT" \
  --num_machines="$WORLD_SIZE" \
  --num_processes="$NUM_PROCESS" \
  --machine_rank="$RANK" \
  --mixed_precision="bf16" \
  --use_fsdp \
  --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP \
  --fsdp_transformer_layer_cls_to_wrap=WanAttentionBlock \
  --fsdp_sharding_strategy "FULL_SHARD" \
  --fsdp_state_dict_type=SHARDED_STATE_DICT \
  --fsdp_backward_prefetch "BACKWARD_PRE" \
  --fsdp_cpu_ram_efficient_loading False \
  scripts/wan2.1/train.py \
  --config_path="config/wan2.1/wan_civitai.yaml" \
  --pretrained_model_name_or_path="$MODEL_NAME" \
  --train_data_dir="${DATASET_NAME:-}" \
  --train_data_meta="$DATASET_META_NAME" \
  --image_sample_size=640 \
  --video_sample_size=640 \
  --token_sample_size=640 \
  --video_sample_stride=2 \
  --video_sample_n_frames=121 \
  --train_batch_size=1 \
  --video_repeat=1 \
  --gradient_accumulation_steps=2 \
  --dataloader_num_workers=8 \
  --num_train_epochs=3 \
  --checkpointing_steps=200 \
  --learning_rate=2e-05 \
  --lr_scheduler="constant_with_warmup" \
  --lr_warmup_steps=100 \
  --seed=42 \
  --output_dir="2.1_i2v_output_dir" \
  --gradient_checkpointing \
  --mixed_precision="bf16" \
  --adam_weight_decay=3e-2 \
  --adam_epsilon=1e-10 \
  --vae_mini_batch=1 \
  --max_grad_norm=0.05 \
  --random_hw_adapt \
  --training_with_video_token_length \
  --enable_bucket \
  --uniform_sampling \
  --low_vram \
  --train_mode="i2v" \
  --trainable_modules "."
# NOTE: DATASET_NAME is commented out above, so --train_data_dir is passed an
# empty value unless the calling environment already defines DATASET_NAME.
# NOTE(review): presumably the paths inside the metadata JSON are then used
# as-is — confirm against train.py.
我发现训练后生成的视频有很强的抖动感,画面明显不稳定:
训练前:
https://github.com/user-attachments/assets/a276b808-5c01-446a-a090-d57c8f83b1a6
训练后:
https://github.com/user-attachments/assets/b44b3b34-28f4-4ecf-b1a0-799142efd3d0
想问一下这是什么问题导致的呢?