Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,6 @@ config/presets/test*.toml
tests/

huggingface/hub/models*
huggingface/hub/version_diffusers_cache.txt
huggingface/hub/version_diffusers_cache.txt
huggingface/
huggingface
57 changes: 57 additions & 0 deletions Example.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
model_train_type = "sdxl-lora"
pretrained_model_name_or_path = "/home/bluvoll/stable-diffusion-webui-reForge/models/Stable-diffusion/Chenkin-RF-0.3-Final-000001.safetensors"
train_data_dir = "/home/bluvoll/Escritorio/anime/kanojos/"
prior_loss_weight = 1
resolution = "1024,1024"
enable_bucket = true
min_bucket_reso = 256
max_bucket_reso = 2048
bucket_reso_steps = 64
bucket_no_upscale = true
output_name = "Kanojos-Aki-trainer"
output_dir = "./output"
save_model_as = "safetensors"
save_precision = "bf16"
save_every_n_epochs = 1
save_state = false
max_train_epochs = 30
train_batch_size = 12
gradient_checkpointing = true
gradient_accumulation_steps = 1
network_train_unet_only = true
network_train_text_encoder_only = false
learning_rate = 0.00004
unet_lr = 0.00004
text_encoder_lr = 0
lr_scheduler = "cosine_with_restarts"
lr_warmup_steps = 5
loss_type = "l2"
lr_scheduler_num_cycles = 2
optimizer_type = "pytorch_optimizer.CAME"
network_module = "lycoris.kohya"
network_dim = 32
network_alpha = 32
log_with = "tensorboard"
logging_dir = "./logs"
caption_extension = ".txt"
shuffle_caption = true
keep_tokens = 0
max_token_length = 255
caption_dropout_rate = 0.1
caption_tag_dropout_rate = 0.1
flow_model = true
flow_use_ot = true
flow_timestep_distribution = "uniform"
flow_uniform_static_ratio = 2
contrastive_flow_matching = false
cfm_lambda = 0.05
seed = 1337
mixed_precision = "bf16"
full_bf16 = true
xformers = true
lowram = false
cache_latents = true
cache_latents_to_disk = true
persistent_data_loader_workers = true
network_args = [ "conv_dim=32", "conv_alpha=32", "dropout=0", "algo=locon" ]
optimizer_args = [ "weight_decay=0.03" ]
4 changes: 2 additions & 2 deletions mikazuki/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ class TaggerInterrogateRequest(BaseModel):

class APIResponse(BaseModel):
    """Generic API response envelope returned by the mikazuki HTTP endpoints."""

    # Outcome indicator, e.g. "success" / "fail" — required on every response.
    status: str
    # Optional human-readable message; defaults to None so callers may omit it
    # (pydantic v2 treats un-defaulted Optional fields as required).
    message: Optional[str] = None
    # Optional payload dict; defaults to None for responses with no data.
    data: Optional[Dict] = None


class APIResponseSuccess(APIResponse):
Expand Down
34 changes: 34 additions & 0 deletions mikazuki/schema/dreambooth.ts
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,40 @@ Schema.intersect([
multires_noise_discount: Schema.number().step(0.1).description("多分辨率(金字塔)衰减率 推荐 0.3-0.8,须同时与上方参数 multires_noise_iterations 一同启用"),
}).description("噪声设置"),

// Rectified Flow 设置 (仅 SDXL 微调)
Schema.union([
Schema.object({
model_train_type: Schema.const("sdxl-finetune").required(),
}).extend(Schema.intersect([
Schema.object({
flow_model: Schema.boolean().default(false).description("启用 Rectified Flow 训练目标(用于 RF 模型微调)"),
}).description("Rectified Flow 设置"),

Schema.union([
Schema.object({
flow_model: Schema.const(true).required(),
flow_use_ot: Schema.boolean().default(false).description("使用余弦最优传输配对 latent 和噪声"),
flow_timestep_distribution: Schema.union(["logit_normal", "uniform"]).default("logit_normal").description("时间步采样分布"),
flow_uniform_static_ratio: Schema.number().step(0.1).description("固定的时间步偏移比率(例如 2),留空不使用"),
contrastive_flow_matching: Schema.boolean().default(false).description("启用对比流匹配 (ΔFM) 目标"),
cfm_lambda: Schema.number().step(0.01).default(0.05).description("ΔFM 损失中对比项的权重"),
}),
Schema.object({}),
]),

Schema.union([
Schema.object({
flow_model: Schema.const(true).required(),
flow_timestep_distribution: Schema.const("logit_normal").required(),
flow_logit_mean: Schema.number().step(0.1).default(0.0).description("logit-normal 分布的均值"),
flow_logit_std: Schema.number().step(0.1).default(1.0).description("logit-normal 分布的标准差"),
}),
Schema.object({}),
]),
])),
Schema.object({}),
]),

Schema.object({
seed: Schema.number().default(1337).description("随机种子"),
clip_skip: Schema.number().role("slider").min(0).max(12).step(1).default(2).description("CLIP 跳过层数 *玄学*"),
Expand Down
34 changes: 34 additions & 0 deletions mikazuki/schema/lora-master.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,40 @@ Schema.intersect([
// 噪声设置
SHARED_SCHEMAS.NOISE_SETTINGS,

// Rectified Flow 设置 (仅 SDXL LoRA)
Schema.union([
Schema.object({
model_train_type: Schema.const("sdxl-lora").required(),
}).extend(Schema.intersect([
Schema.object({
flow_model: Schema.boolean().default(false).description("启用 Rectified Flow 训练目标(用于 RF 模型微调)"),
}).description("Rectified Flow 设置"),

Schema.union([
Schema.object({
flow_model: Schema.const(true).required(),
flow_use_ot: Schema.boolean().default(false).description("使用余弦最优传输配对 latent 和噪声"),
flow_timestep_distribution: Schema.union(["logit_normal", "uniform"]).default("logit_normal").description("时间步采样分布"),
flow_uniform_static_ratio: Schema.number().step(0.1).description("固定的时间步偏移比率(例如 2),留空不使用"),
contrastive_flow_matching: Schema.boolean().default(false).description("启用对比流匹配 (ΔFM) 目标"),
cfm_lambda: Schema.number().step(0.01).default(0.05).description("ΔFM 损失中对比项的权重"),
}),
Schema.object({}),
]),

Schema.union([
Schema.object({
flow_model: Schema.const(true).required(),
flow_timestep_distribution: Schema.const("logit_normal").required(),
flow_logit_mean: Schema.number().step(0.1).default(0.0).description("logit-normal 分布的均值"),
flow_logit_std: Schema.number().step(0.1).default(1.0).description("logit-normal 分布的标准差"),
}),
Schema.object({}),
]),
])),
Schema.object({}),
]),

// 数据增强
SHARED_SCHEMAS.DATA_ENCHANCEMENT,

Expand Down
2 changes: 1 addition & 1 deletion mikazuki/utils/train_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def get_total_images(path, recursive=True):


def fix_config_types(config: dict):
    """Coerce known float parameters in *config* to ``float`` in place.

    TOML round-trips can parse whole-number values (e.g. ``2``) as ints or
    strings; downstream training code expects real floats for these keys.
    Keys that are absent from *config* are left untouched.
    """
    # Single authoritative list (the duplicated assignment was dead code).
    keep_float_params = [
        "guidance_scale",
        "sigmoid_scale",
        "discrete_flow_shift",
        "flow_uniform_static_ratio",
        "flow_logit_mean",
        "flow_logit_std",
        "cfm_lambda",
    ]
    for k in keep_float_params:
        if k in config:
            config[k] = float(config[k])
22 changes: 11 additions & 11 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
accelerate==0.33.0
transformers==4.44.0
diffusers[torch]==0.25.0
accelerate==1.10.1
transformers==4.55.2
diffusers[torch]==0.35.1
ftfy==6.1.1
# albumentations==1.3.0
opencv-python==4.8.1.78
einops==0.7.0
pytorch-lightning==1.9.0
bitsandbytes==0.46.0
bitsandbytes==0.48.1
lion-pytorch==0.1.2
schedulefree==1.4
pytorch-optimizer==3.7.0
pytorch-optimizer==3.8.0
prodigy-plus-schedule-free==1.9.0
prodigyopt==1.1.2
tensorboard==2.10.1
safetensors==0.4.4
tensorboard==2.20.0
safetensors==0.6.2
prodigy-plus-schedule-free
# gradio==3.16.2
altair==4.2.2
easygui==0.98.3
toml==0.10.2
voluptuous==0.13.1
huggingface-hub==0.24.5
huggingface-hub==0.34.4
# for Image utils
imagesize==1.4.1
# for T5XXL tokenizer (SD3/FLUX)
Expand All @@ -34,9 +34,9 @@ pillow
numpy==1.26.4
# <=2.0.0
gradio==3.44.2
fastapi==0.95.1
uvicorn==0.22.0
wandb==0.16.2
fastapi==0.104.1
uvicorn==0.34.0
wandb==0.21.1
httpx==0.24.1
# extra
open-clip-torch==2.20.0
Expand Down
139 changes: 123 additions & 16 deletions scripts/stable/library/train_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5195,8 +5195,49 @@ def save_sd_model_on_train_end_common(
huggingface_util.upload(args, out_dir, "/" + model_name, force_sync_upload=True)


def get_timesteps_and_huber_c(args, min_timestep, max_timestep, noise_scheduler, b_size, device):
timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device="cpu")
def cosine_optimal_transport(X: torch.Tensor, Y: torch.Tensor, backend: str = "auto"):
    """Compute an optimal assignment between rows of X and Y under cosine distance.

    Args:
        X: ``(n, d)`` tensor of row vectors (e.g. flattened latents).
        Y: ``(n, d)`` tensor of row vectors (e.g. flattened noise samples).
        backend: ``"cuda"`` to force the torch_linear_assignment solver,
            ``"scipy"`` to force ``scipy.optimize.linear_sum_assignment``, or
            ``"auto"`` (default) to try the CUDA solver and fall back to scipy.

    Returns:
        ``(cost, (row_idx, col_idx))``: the ``(n, n)`` cost matrix (negated
        cosine similarity) and the optimal assignment indices.
    """
    # clamp_min guards against zero-norm rows, which would otherwise divide
    # by zero and fill the cost matrix with NaNs, yielding a garbage assignment.
    X_norm = X / torch.norm(X, dim=1, keepdim=True).clamp_min(1e-12)
    Y_norm = Y / torch.norm(Y, dim=1, keepdim=True).clamp_min(1e-12)
    # Negated cosine similarity: minimizing cost maximizes similarity.
    cost = -torch.mm(X_norm, Y_norm.t())

    if backend == "cuda":
        return _cuda_assignment(cost)
    if backend == "scipy":
        return _scipy_assignment(cost)

    # "auto": prefer the GPU solver; fall back to scipy when the optional
    # torch_linear_assignment package is missing or fails at runtime.
    try:
        return _cuda_assignment(cost)
    except (ImportError, RuntimeError):
        return _scipy_assignment(cost)


def _cuda_assignment(cost: torch.Tensor):
    """Solve the linear assignment problem via the optional torch_linear_assignment package.

    Raises ImportError when the package is not installed; callers catch this
    to fall back to the scipy solver.
    """
    from torch_linear_assignment import assignment_to_indices, batch_linear_assignment  # type: ignore

    # The solver is batched; wrap the single cost matrix in a batch of one.
    batched_cost = cost.unsqueeze(0)
    rows, cols = assignment_to_indices(batch_linear_assignment(batched_cost))
    return cost, (rows, cols)


def _scipy_assignment(cost: torch.Tensor):
from scipy.optimize import linear_sum_assignment # type: ignore

cost_np = cost.to(torch.float32).detach().cpu().numpy()
row_ind, col_ind = linear_sum_assignment(cost_np)
row = torch.from_numpy(row_ind).to(cost.device, torch.long)
col = torch.from_numpy(col_ind).to(cost.device, torch.long)
return cost, (row, col)


def get_timesteps_and_huber_c(
args, min_timestep, max_timestep, noise_scheduler, b_size, device, timesteps_override=None
):
if timesteps_override is not None:
timesteps = timesteps_override.to("cpu")
else:
timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device="cpu")

if args.loss_type == "huber" or args.loss_type == "smooth_l1":
if args.huber_schedule == "exponential":
Expand All @@ -5220,7 +5261,13 @@ def get_timesteps_and_huber_c(args, min_timestep, max_timestep, noise_scheduler,
return timesteps, huber_c


def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents):
def get_noise_noisy_latents_and_timesteps(
args,
noise_scheduler,
latents,
pre_sampled_timesteps=None,
pixel_counts=None,
):
# Sample noise that we'll add to the latents
noise = torch.randn_like(latents, device=latents.device)
if args.noise_offset:
Expand All @@ -5234,23 +5281,83 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents):
noise, latents.device, args.multires_noise_iterations, args.multires_noise_discount
)

# Sample a random timestep for each image
b_size = latents.shape[0]
min_timestep = 0 if args.min_timestep is None else args.min_timestep
max_timestep = noise_scheduler.config.num_train_timesteps if args.max_timestep is None else args.max_timestep
flow_model_enabled = getattr(args, "flow_model", False)

if flow_model_enabled:
timestep_max = noise_scheduler.config.num_train_timesteps
distribution = getattr(args, "flow_timestep_distribution", "logit_normal")
if distribution == "logit_normal":
logits = torch.normal(
mean=getattr(args, "flow_logit_mean", 0.0),
std=getattr(args, "flow_logit_std", 1.0),
size=(b_size,),
device=latents.device,
)
sigmas = torch.sigmoid(logits)
elif distribution == "uniform":
sigmas = torch.rand((b_size,), device=latents.device)
else:
raise ValueError(f"Unknown flow_timestep_distribution: {distribution}")

timesteps, huber_c = get_timesteps_and_huber_c(args, min_timestep, max_timestep, noise_scheduler, b_size, latents.device)
shift_requested = (
getattr(args, "flow_uniform_shift", False) or getattr(args, "flow_uniform_static_ratio", None) is not None
)
if sigmas is not None and shift_requested:
static_ratio = getattr(args, "flow_uniform_static_ratio", None)
if static_ratio is not None:
if static_ratio <= 0:
raise ValueError("`flow_uniform_static_ratio` must be positive when used.")
ratios = torch.full((b_size,), float(static_ratio), device=latents.device, dtype=torch.float32)
else:
if pixel_counts is None:
raise ValueError("Resolution-dependent Rectified Flow shift requires pixel_counts.")
base_pixels = getattr(args, "flow_uniform_base_pixels", None)
if base_pixels is None or base_pixels <= 0:
raise ValueError("`flow_uniform_base_pixels` must be positive when using flow_uniform_shift.")
ratios = torch.sqrt(
torch.as_tensor(pixel_counts, device=latents.device, dtype=torch.float32) / float(base_pixels)
)

# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
if args.ip_noise_gamma:
if args.ip_noise_gamma_random_strength:
strength = torch.rand(1, device=latents.device) * args.ip_noise_gamma
else:
strength = args.ip_noise_gamma
noisy_latents = noise_scheduler.add_noise(latents, noise + strength * torch.randn_like(latents), timesteps)
t_ref = sigmas
sigmas = ratios * t_ref / (1 + (ratios - 1) * t_ref)

timesteps = torch.clamp((sigmas * timestep_max).long(), 0, timestep_max - 1)
_, huber_c = get_timesteps_and_huber_c(
args,
0,
noise_scheduler.config.num_train_timesteps,
noise_scheduler,
b_size,
latents.device,
timesteps_override=timesteps,
)
else:
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
min_timestep = 0 if args.min_timestep is None else args.min_timestep
max_timestep = noise_scheduler.config.num_train_timesteps if args.max_timestep is None else args.max_timestep
timesteps, huber_c = get_timesteps_and_huber_c(args, min_timestep, max_timestep, noise_scheduler, b_size, latents.device)

if flow_model_enabled:
if getattr(args, "flow_use_ot", False) and b_size > 1:
with torch.no_grad():
lat_flat = latents.view(b_size, -1)
noise_flat = noise.view(b_size, -1)
_, (_, col_indices) = cosine_optimal_transport(lat_flat, noise_flat)
noise = noise[col_indices.squeeze(0)]

sigmas_view = sigmas.view(-1, 1, 1, 1)
noisy_latents = sigmas_view * noise + (1.0 - sigmas_view) * latents
else:
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
if args.ip_noise_gamma:
if args.ip_noise_gamma_random_strength:
strength = torch.rand(1, device=latents.device) * args.ip_noise_gamma
else:
strength = args.ip_noise_gamma
noisy_latents = noise_scheduler.add_noise(latents, noise + strength * torch.randn_like(latents), timesteps)
else:
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

return noise, noisy_latents, timesteps, huber_c

Expand Down
Loading