Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,6 @@ config/presets/test*.toml
tests/

huggingface/hub/models*
huggingface/hub/version_diffusers_cache.txt
huggingface/hub/version_diffusers_cache.txt
huggingface/
huggingface
57 changes: 57 additions & 0 deletions Example.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
model_train_type = "sdxl-lora"
pretrained_model_name_or_path = "/home/bluvoll/stable-diffusion-webui-reForge/models/Stable-diffusion/Chenkin-RF-0.3-Final-000001.safetensors"
train_data_dir = "/home/bluvoll/Escritorio/anime/kanojos/"
prior_loss_weight = 1
resolution = "1024,1024"
enable_bucket = true
min_bucket_reso = 256
max_bucket_reso = 2048
bucket_reso_steps = 64
bucket_no_upscale = true
output_name = "Kanojos-Aki-trainer"
output_dir = "./output"
save_model_as = "safetensors"
save_precision = "bf16"
save_every_n_epochs = 1
save_state = false
max_train_epochs = 30
train_batch_size = 12
gradient_checkpointing = true
gradient_accumulation_steps = 1
network_train_unet_only = true
network_train_text_encoder_only = false
learning_rate = 0.00004
unet_lr = 0.00004
text_encoder_lr = 0
lr_scheduler = "cosine_with_restarts"
lr_warmup_steps = 5
loss_type = "l2"
lr_scheduler_num_cycles = 2
optimizer_type = "pytorch_optimizer.CAME"
network_module = "lycoris.kohya"
network_dim = 32
network_alpha = 32
log_with = "tensorboard"
logging_dir = "./logs"
caption_extension = ".txt"
shuffle_caption = true
keep_tokens = 0
max_token_length = 255
caption_dropout_rate = 0.1
caption_tag_dropout_rate = 0.1
flow_model = true
flow_use_ot = true
flow_timestep_distribution = "uniform"
flow_uniform_static_ratio = 2
contrastive_flow_matching = false
cfm_lambda = 0.05
seed = 1337
mixed_precision = "bf16"
full_bf16 = true
xformers = true
lowram = false
cache_latents = true
cache_latents_to_disk = true
persistent_data_loader_workers = true
network_args = [ "conv_dim=32", "conv_alpha=32", "dropout=0", "algo=locon" ]
optimizer_args = [ "weight_decay=0.03" ]
4 changes: 2 additions & 2 deletions mikazuki/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ class TaggerInterrogateRequest(BaseModel):

class APIResponse(BaseModel):
    """Generic API response envelope returned by the mikazuki HTTP endpoints."""

    # Outcome indicator, e.g. "success" / "fail" — required on every response.
    status: str
    # Optional human-readable message; defaults to None so callers may omit it
    # (pydantic v2 treats un-defaulted Optional fields as required).
    message: Optional[str] = None
    # Optional payload dict; defaults to None for responses with no data.
    data: Optional[Dict] = None


class APIResponseSuccess(APIResponse):
Expand Down
34 changes: 34 additions & 0 deletions mikazuki/schema/dreambooth.ts
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,40 @@ Schema.intersect([
multires_noise_discount: Schema.number().step(0.1).description("多分辨率(金字塔)衰减率 推荐 0.3-0.8,须同时与上方参数 multires_noise_iterations 一同启用"),
}).description("噪声设置"),

// Rectified Flow 设置 (仅 SDXL 微调)
Schema.union([
Schema.object({
model_train_type: Schema.const("sdxl-finetune").required(),
}).extend(Schema.intersect([
Schema.object({
flow_model: Schema.boolean().default(false).description("启用 Rectified Flow 训练目标(用于 RF 模型微调)"),
}).description("Rectified Flow 设置"),

Schema.union([
Schema.object({
flow_model: Schema.const(true).required(),
flow_use_ot: Schema.boolean().default(false).description("使用余弦最优传输配对 latent 和噪声"),
flow_timestep_distribution: Schema.union(["logit_normal", "uniform"]).default("logit_normal").description("时间步采样分布"),
flow_uniform_static_ratio: Schema.number().step(0.1).description("固定的时间步偏移比率(例如 2),留空不使用"),
contrastive_flow_matching: Schema.boolean().default(false).description("启用对比流匹配 (ΔFM) 目标"),
cfm_lambda: Schema.number().step(0.01).default(0.05).description("ΔFM 损失中对比项的权重"),
}),
Schema.object({}),
]),

Schema.union([
Schema.object({
flow_model: Schema.const(true).required(),
flow_timestep_distribution: Schema.const("logit_normal").required(),
flow_logit_mean: Schema.number().step(0.1).default(0.0).description("logit-normal 分布的均值"),
flow_logit_std: Schema.number().step(0.1).default(1.0).description("logit-normal 分布的标准差"),
}),
Schema.object({}),
]),
])),
Schema.object({}),
]),

Schema.object({
seed: Schema.number().default(1337).description("随机种子"),
clip_skip: Schema.number().role("slider").min(0).max(12).step(1).default(2).description("CLIP 跳过层数 *玄学*"),
Expand Down
34 changes: 34 additions & 0 deletions mikazuki/schema/lora-master.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,40 @@ Schema.intersect([
// 噪声设置
SHARED_SCHEMAS.NOISE_SETTINGS,

// Rectified Flow 设置 (仅 SDXL LoRA)
Schema.union([
Schema.object({
model_train_type: Schema.const("sdxl-lora").required(),
}).extend(Schema.intersect([
Schema.object({
flow_model: Schema.boolean().default(false).description("启用 Rectified Flow 训练目标(用于 RF 模型微调)"),
}).description("Rectified Flow 设置"),

Schema.union([
Schema.object({
flow_model: Schema.const(true).required(),
flow_use_ot: Schema.boolean().default(false).description("使用余弦最优传输配对 latent 和噪声"),
flow_timestep_distribution: Schema.union(["logit_normal", "uniform"]).default("logit_normal").description("时间步采样分布"),
flow_uniform_static_ratio: Schema.number().step(0.1).description("固定的时间步偏移比率(例如 2),留空不使用"),
contrastive_flow_matching: Schema.boolean().default(false).description("启用对比流匹配 (ΔFM) 目标"),
cfm_lambda: Schema.number().step(0.01).default(0.05).description("ΔFM 损失中对比项的权重"),
}),
Schema.object({}),
]),

Schema.union([
Schema.object({
flow_model: Schema.const(true).required(),
flow_timestep_distribution: Schema.const("logit_normal").required(),
flow_logit_mean: Schema.number().step(0.1).default(0.0).description("logit-normal 分布的均值"),
flow_logit_std: Schema.number().step(0.1).default(1.0).description("logit-normal 分布的标准差"),
}),
Schema.object({}),
]),
])),
Schema.object({}),
]),

// 数据增强
SHARED_SCHEMAS.DATA_ENCHANCEMENT,

Expand Down
2 changes: 1 addition & 1 deletion mikazuki/utils/train_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def get_total_images(path, recursive=True):


def fix_config_types(config: dict):
    """Coerce known float parameters in *config* to ``float`` in place.

    TOML round-trips can parse whole-number values (e.g. ``2``) as ints or
    strings; downstream training code expects real floats for these keys.
    Keys that are absent from *config* are left untouched.
    """
    # Single authoritative list (the duplicated assignment was dead code).
    keep_float_params = [
        "guidance_scale",
        "sigmoid_scale",
        "discrete_flow_shift",
        "flow_uniform_static_ratio",
        "flow_logit_mean",
        "flow_logit_std",
        "cfm_lambda",
    ]
    for k in keep_float_params:
        if k in config:
            config[k] = float(config[k])
22 changes: 11 additions & 11 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
accelerate==0.33.0
transformers==4.44.0
diffusers[torch]==0.25.0
accelerate==1.10.1
transformers==4.55.2
diffusers[torch]==0.35.1
ftfy==6.1.1
# albumentations==1.3.0
opencv-python==4.8.1.78
einops==0.7.0
pytorch-lightning==1.9.0
bitsandbytes==0.46.0
bitsandbytes==0.48.1
lion-pytorch==0.1.2
schedulefree==1.4
pytorch-optimizer==3.7.0
pytorch-optimizer==3.8.0
prodigy-plus-schedule-free==1.9.0
prodigyopt==1.1.2
tensorboard==2.10.1
safetensors==0.4.4
tensorboard==2.20.0
safetensors==0.6.2
prodigy-plus-schedule-free
# gradio==3.16.2
altair==4.2.2
easygui==0.98.3
toml==0.10.2
voluptuous==0.13.1
huggingface-hub==0.24.5
huggingface-hub==0.34.4
# for Image utils
imagesize==1.4.1
# for T5XXL tokenizer (SD3/FLUX)
Expand All @@ -34,9 +34,9 @@ pillow
numpy==1.26.4
# <=2.0.0
gradio==3.44.2
fastapi==0.95.1
uvicorn==0.22.0
wandb==0.16.2
fastapi==0.104.1
uvicorn==0.34.0
wandb==0.21.1
httpx==0.24.1
# extra
open-clip-torch==2.20.0
Expand Down
139 changes: 123 additions & 16 deletions scripts/stable/library/train_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5195,8 +5195,49 @@ def save_sd_model_on_train_end_common(
huggingface_util.upload(args, out_dir, "/" + model_name, force_sync_upload=True)


def get_timesteps_and_huber_c(args, min_timestep, max_timestep, noise_scheduler, b_size, device):
timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device="cpu")
def cosine_optimal_transport(X: torch.Tensor, Y: torch.Tensor, backend: str = "auto"):
    """Compute an optimal assignment between rows of X and Y under cosine distance.

    Args:
        X: ``(n, d)`` tensor of row vectors (e.g. flattened latents).
        Y: ``(n, d)`` tensor of row vectors (e.g. flattened noise samples).
        backend: ``"cuda"`` to force the torch_linear_assignment solver,
            ``"scipy"`` to force ``scipy.optimize.linear_sum_assignment``, or
            ``"auto"`` (default) to try the CUDA solver and fall back to scipy.

    Returns:
        ``(cost, (row_idx, col_idx))``: the ``(n, n)`` cost matrix (negated
        cosine similarity) and the optimal assignment indices.
    """
    # clamp_min guards against zero-norm rows, which would otherwise divide
    # by zero and fill the cost matrix with NaNs, yielding a garbage assignment.
    X_norm = X / torch.norm(X, dim=1, keepdim=True).clamp_min(1e-12)
    Y_norm = Y / torch.norm(Y, dim=1, keepdim=True).clamp_min(1e-12)
    # Negated cosine similarity: minimizing cost maximizes similarity.
    cost = -torch.mm(X_norm, Y_norm.t())

    if backend == "cuda":
        return _cuda_assignment(cost)
    if backend == "scipy":
        return _scipy_assignment(cost)

    # "auto": prefer the GPU solver; fall back to scipy when the optional
    # torch_linear_assignment package is missing or fails at runtime.
    try:
        return _cuda_assignment(cost)
    except (ImportError, RuntimeError):
        return _scipy_assignment(cost)


def _cuda_assignment(cost: torch.Tensor):
    """Solve the linear assignment problem via the optional torch_linear_assignment package.

    Raises ImportError when the package is not installed; callers catch this
    to fall back to the scipy solver.
    """
    from torch_linear_assignment import assignment_to_indices, batch_linear_assignment  # type: ignore

    # The solver is batched; wrap the single cost matrix in a batch of one.
    batched_cost = cost.unsqueeze(0)
    rows, cols = assignment_to_indices(batch_linear_assignment(batched_cost))
    return cost, (rows, cols)


def _scipy_assignment(cost: torch.Tensor):
from scipy.optimize import linear_sum_assignment # type: ignore

cost_np = cost.to(torch.float32).detach().cpu().numpy()
row_ind, col_ind = linear_sum_assignment(cost_np)
row = torch.from_numpy(row_ind).to(cost.device, torch.long)
col = torch.from_numpy(col_ind).to(cost.device, torch.long)
return cost, (row, col)


def get_timesteps_and_huber_c(
args, min_timestep, max_timestep, noise_scheduler, b_size, device, timesteps_override=None
):
if timesteps_override is not None:
timesteps = timesteps_override.to("cpu")
else:
timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device="cpu")

if args.loss_type == "huber" or args.loss_type == "smooth_l1":
if args.huber_schedule == "exponential":
Expand All @@ -5220,7 +5261,13 @@ def get_timesteps_and_huber_c(args, min_timestep, max_timestep, noise_scheduler,
return timesteps, huber_c


def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents):
def get_noise_noisy_latents_and_timesteps(
args,
noise_scheduler,
latents,
pre_sampled_timesteps=None,
pixel_counts=None,
):
# Sample noise that we'll add to the latents
noise = torch.randn_like(latents, device=latents.device)
if args.noise_offset:
Expand All @@ -5234,23 +5281,83 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents):
noise, latents.device, args.multires_noise_iterations, args.multires_noise_discount
)

# Sample a random timestep for each image
b_size = latents.shape[0]
min_timestep = 0 if args.min_timestep is None else args.min_timestep
max_timestep = noise_scheduler.config.num_train_timesteps if args.max_timestep is None else args.max_timestep
flow_model_enabled = getattr(args, "flow_model", False)

if flow_model_enabled:
timestep_max = noise_scheduler.config.num_train_timesteps
distribution = getattr(args, "flow_timestep_distribution", "logit_normal")
if distribution == "logit_normal":
logits = torch.normal(
mean=getattr(args, "flow_logit_mean", 0.0),
std=getattr(args, "flow_logit_std", 1.0),
size=(b_size,),
device=latents.device,
)
sigmas = torch.sigmoid(logits)
elif distribution == "uniform":
sigmas = torch.rand((b_size,), device=latents.device)
else:
raise ValueError(f"Unknown flow_timestep_distribution: {distribution}")

timesteps, huber_c = get_timesteps_and_huber_c(args, min_timestep, max_timestep, noise_scheduler, b_size, latents.device)
shift_requested = (
getattr(args, "flow_uniform_shift", False) or getattr(args, "flow_uniform_static_ratio", None) is not None
)
if sigmas is not None and shift_requested:
static_ratio = getattr(args, "flow_uniform_static_ratio", None)
if static_ratio is not None:
if static_ratio <= 0:
raise ValueError("`flow_uniform_static_ratio` must be positive when used.")
ratios = torch.full((b_size,), float(static_ratio), device=latents.device, dtype=torch.float32)
else:
if pixel_counts is None:
raise ValueError("Resolution-dependent Rectified Flow shift requires pixel_counts.")
base_pixels = getattr(args, "flow_uniform_base_pixels", None)
if base_pixels is None or base_pixels <= 0:
raise ValueError("`flow_uniform_base_pixels` must be positive when using flow_uniform_shift.")
ratios = torch.sqrt(
torch.as_tensor(pixel_counts, device=latents.device, dtype=torch.float32) / float(base_pixels)
)

# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
if args.ip_noise_gamma:
if args.ip_noise_gamma_random_strength:
strength = torch.rand(1, device=latents.device) * args.ip_noise_gamma
else:
strength = args.ip_noise_gamma
noisy_latents = noise_scheduler.add_noise(latents, noise + strength * torch.randn_like(latents), timesteps)
t_ref = sigmas
sigmas = ratios * t_ref / (1 + (ratios - 1) * t_ref)

timesteps = torch.clamp((sigmas * timestep_max).long(), 0, timestep_max - 1)
_, huber_c = get_timesteps_and_huber_c(
args,
0,
noise_scheduler.config.num_train_timesteps,
noise_scheduler,
b_size,
latents.device,
timesteps_override=timesteps,
)
else:
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
min_timestep = 0 if args.min_timestep is None else args.min_timestep
max_timestep = noise_scheduler.config.num_train_timesteps if args.max_timestep is None else args.max_timestep
timesteps, huber_c = get_timesteps_and_huber_c(args, min_timestep, max_timestep, noise_scheduler, b_size, latents.device)

if flow_model_enabled:
if getattr(args, "flow_use_ot", False) and b_size > 1:
with torch.no_grad():
lat_flat = latents.view(b_size, -1)
noise_flat = noise.view(b_size, -1)
_, (_, col_indices) = cosine_optimal_transport(lat_flat, noise_flat)
noise = noise[col_indices.squeeze(0)]

sigmas_view = sigmas.view(-1, 1, 1, 1)
noisy_latents = sigmas_view * noise + (1.0 - sigmas_view) * latents
else:
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
if args.ip_noise_gamma:
if args.ip_noise_gamma_random_strength:
strength = torch.rand(1, device=latents.device) * args.ip_noise_gamma
else:
strength = args.ip_noise_gamma
noisy_latents = noise_scheduler.add_noise(latents, noise + strength * torch.randn_like(latents), timesteps)
else:
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

return noise, noisy_latents, timesteps, huber_c

Expand Down
Loading