mirror of
https://github.com/kohya-ss/sd-scripts.git
synced 2026-04-06 13:47:06 +00:00
Support Anima model (#2260)
* Support Anima model * Update document and fix bug * Fix latent normalization * Fix typo * Fix cache embedding * fix typo in tests/test_anima_cache.py * Remove redundant argument apply_t5_attn_mask * Improve caching with argument caption_dropout_rate * Fix W&B logging bugs * Fix discrete_flow_shift default value
This commit is contained in:
887
anima_train.py
Normal file
887
anima_train.py
Normal file
@@ -0,0 +1,887 @@
|
|||||||
|
# Anima full finetune training script
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
import copy
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
from multiprocessing import Value
|
||||||
|
from typing import List
|
||||||
|
import toml
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from library import utils
|
||||||
|
from library.device_utils import init_ipex, clean_memory_on_device
|
||||||
|
|
||||||
|
init_ipex()
|
||||||
|
|
||||||
|
from accelerate.utils import set_seed
|
||||||
|
from library import deepspeed_utils, anima_models, anima_train_utils, anima_utils, strategy_base, strategy_anima, sai_model_spec
|
||||||
|
|
||||||
|
import library.train_util as train_util
|
||||||
|
|
||||||
|
from library.utils import setup_logging, add_logging_arguments
|
||||||
|
|
||||||
|
setup_logging()
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
import library.config_util as config_util
|
||||||
|
|
||||||
|
from library.config_util import (
|
||||||
|
ConfigSanitizer,
|
||||||
|
BlueprintGenerator,
|
||||||
|
)
|
||||||
|
from library.custom_train_functions import apply_masked_loss, add_custom_train_arguments
|
||||||
|
|
||||||
|
|
||||||
|
def train(args):
|
||||||
|
train_util.verify_training_args(args)
|
||||||
|
train_util.prepare_dataset_args(args, True)
|
||||||
|
deepspeed_utils.prepare_deepspeed_args(args)
|
||||||
|
setup_logging(args, reset=True)
|
||||||
|
|
||||||
|
# backward compatibility
|
||||||
|
if not args.skip_cache_check:
|
||||||
|
args.skip_cache_check = args.skip_latents_validity_check
|
||||||
|
|
||||||
|
if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs:
|
||||||
|
logger.warning(
|
||||||
|
"cache_text_encoder_outputs_to_disk is enabled, so cache_text_encoder_outputs is also enabled"
|
||||||
|
)
|
||||||
|
args.cache_text_encoder_outputs = True
|
||||||
|
|
||||||
|
if args.cpu_offload_checkpointing and not args.gradient_checkpointing:
|
||||||
|
logger.warning("cpu_offload_checkpointing is enabled, so gradient_checkpointing is also enabled")
|
||||||
|
args.gradient_checkpointing = True
|
||||||
|
|
||||||
|
if getattr(args, 'unsloth_offload_checkpointing', False):
|
||||||
|
if not args.gradient_checkpointing:
|
||||||
|
logger.warning("unsloth_offload_checkpointing is enabled, so gradient_checkpointing is also enabled")
|
||||||
|
args.gradient_checkpointing = True
|
||||||
|
assert not args.cpu_offload_checkpointing, \
|
||||||
|
"Cannot use both --unsloth_offload_checkpointing and --cpu_offload_checkpointing"
|
||||||
|
|
||||||
|
assert (
|
||||||
|
args.blocks_to_swap is None or args.blocks_to_swap == 0
|
||||||
|
) or not args.cpu_offload_checkpointing, "blocks_to_swap is not supported with cpu_offload_checkpointing"
|
||||||
|
|
||||||
|
assert (
|
||||||
|
args.blocks_to_swap is None or args.blocks_to_swap == 0
|
||||||
|
) or not getattr(args, 'unsloth_offload_checkpointing', False), \
|
||||||
|
"blocks_to_swap is not supported with unsloth_offload_checkpointing"
|
||||||
|
|
||||||
|
# Flash attention: validate availability
|
||||||
|
if getattr(args, 'flash_attn', False):
|
||||||
|
try:
|
||||||
|
import flash_attn # noqa: F401
|
||||||
|
logger.info("Flash Attention enabled for DiT blocks")
|
||||||
|
except ImportError:
|
||||||
|
logger.warning("flash_attn package not installed, falling back to PyTorch SDPA")
|
||||||
|
args.flash_attn = False
|
||||||
|
|
||||||
|
cache_latents = args.cache_latents
|
||||||
|
use_dreambooth_method = args.in_json is None
|
||||||
|
|
||||||
|
if args.seed is not None:
|
||||||
|
set_seed(args.seed)
|
||||||
|
|
||||||
|
# prepare caching strategy: must be set before preparing dataset
|
||||||
|
if args.cache_latents:
|
||||||
|
latents_caching_strategy = strategy_anima.AnimaLatentsCachingStrategy(
|
||||||
|
args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check
|
||||||
|
)
|
||||||
|
strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy)
|
||||||
|
|
||||||
|
# prepare dataset
|
||||||
|
if args.dataset_class is None:
|
||||||
|
blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, True))
|
||||||
|
if args.dataset_config is not None:
|
||||||
|
logger.info(f"Load dataset config from {args.dataset_config}")
|
||||||
|
user_config = config_util.load_user_config(args.dataset_config)
|
||||||
|
ignored = ["train_data_dir", "in_json"]
|
||||||
|
if any(getattr(args, attr) is not None for attr in ignored):
|
||||||
|
logger.warning(
|
||||||
|
"ignore following options because config file is found: {0}".format(", ".join(ignored))
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if use_dreambooth_method:
|
||||||
|
logger.info("Using DreamBooth method.")
|
||||||
|
user_config = {
|
||||||
|
"datasets": [
|
||||||
|
{
|
||||||
|
"subsets": config_util.generate_dreambooth_subsets_config_by_subdirs(
|
||||||
|
args.train_data_dir, args.reg_data_dir
|
||||||
|
)
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
logger.info("Training with captions.")
|
||||||
|
user_config = {
|
||||||
|
"datasets": [
|
||||||
|
{
|
||||||
|
"subsets": [
|
||||||
|
{
|
||||||
|
"image_dir": args.train_data_dir,
|
||||||
|
"metadata_file": args.in_json,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
blueprint = blueprint_generator.generate(user_config, args)
|
||||||
|
train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group)
|
||||||
|
else:
|
||||||
|
train_dataset_group = train_util.load_arbitrary_dataset(args)
|
||||||
|
val_dataset_group = None
|
||||||
|
|
||||||
|
current_epoch = Value("i", 0)
|
||||||
|
current_step = Value("i", 0)
|
||||||
|
ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None
|
||||||
|
collator = train_util.collator_class(current_epoch, current_step, ds_for_collator)
|
||||||
|
|
||||||
|
train_dataset_group.verify_bucket_reso_steps(8) # WanVAE spatial downscale = 8
|
||||||
|
|
||||||
|
# Anima uses embedding-level dropout (in AnimaTextEncodingStrategy) instead of
|
||||||
|
# dataset-level caption dropout, so we save the rate and zero out subset-level
|
||||||
|
# caption_dropout_rate to allow text encoder output caching.
|
||||||
|
caption_dropout_rate = getattr(args, 'caption_dropout_rate', 0.0)
|
||||||
|
if caption_dropout_rate > 0:
|
||||||
|
logger.info(f"Using embedding-level caption dropout rate: {caption_dropout_rate}")
|
||||||
|
for dataset in train_dataset_group.datasets:
|
||||||
|
for subset in dataset.subsets:
|
||||||
|
subset.caption_dropout_rate = 0.0
|
||||||
|
|
||||||
|
if args.debug_dataset:
|
||||||
|
if args.cache_text_encoder_outputs:
|
||||||
|
strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(
|
||||||
|
strategy_anima.AnimaTextEncoderOutputsCachingStrategy(
|
||||||
|
args.cache_text_encoder_outputs_to_disk,
|
||||||
|
args.text_encoder_batch_size,
|
||||||
|
False,
|
||||||
|
False,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
train_dataset_group.set_current_strategies()
|
||||||
|
train_util.debug_dataset(train_dataset_group, True)
|
||||||
|
return
|
||||||
|
if len(train_dataset_group) == 0:
|
||||||
|
logger.error("No data found. Please verify the metadata file and train_data_dir option.")
|
||||||
|
return
|
||||||
|
|
||||||
|
if cache_latents:
|
||||||
|
assert (
|
||||||
|
train_dataset_group.is_latent_cacheable()
|
||||||
|
), "when caching latents, either color_aug or random_crop cannot be used"
|
||||||
|
|
||||||
|
if args.cache_text_encoder_outputs:
|
||||||
|
assert (
|
||||||
|
train_dataset_group.is_text_encoder_output_cacheable()
|
||||||
|
), "when caching text encoder output, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used"
|
||||||
|
|
||||||
|
# prepare accelerator
|
||||||
|
logger.info("prepare accelerator")
|
||||||
|
accelerator = train_util.prepare_accelerator(args)
|
||||||
|
|
||||||
|
# mixed precision dtype
|
||||||
|
weight_dtype, save_dtype = train_util.prepare_dtype(args)
|
||||||
|
|
||||||
|
# parse transformer_dtype
|
||||||
|
transformer_dtype = None
|
||||||
|
if hasattr(args, 'transformer_dtype') and args.transformer_dtype is not None:
|
||||||
|
transformer_dtype_map = {
|
||||||
|
"float16": torch.float16,
|
||||||
|
"bfloat16": torch.bfloat16,
|
||||||
|
"float32": torch.float32,
|
||||||
|
}
|
||||||
|
transformer_dtype = transformer_dtype_map.get(args.transformer_dtype, None)
|
||||||
|
|
||||||
|
# Load tokenizers and set strategies
|
||||||
|
logger.info("Loading tokenizers...")
|
||||||
|
qwen3_text_encoder, qwen3_tokenizer = anima_utils.load_qwen3_text_encoder(
|
||||||
|
args.qwen3_path, dtype=weight_dtype, device="cpu"
|
||||||
|
)
|
||||||
|
t5_tokenizer = anima_utils.load_t5_tokenizer(
|
||||||
|
getattr(args, 't5_tokenizer_path', None)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Set tokenize strategy
|
||||||
|
tokenize_strategy = strategy_anima.AnimaTokenizeStrategy(
|
||||||
|
qwen3_tokenizer=qwen3_tokenizer,
|
||||||
|
t5_tokenizer=t5_tokenizer,
|
||||||
|
qwen3_max_length=args.qwen3_max_token_length,
|
||||||
|
t5_max_length=args.t5_max_token_length,
|
||||||
|
)
|
||||||
|
strategy_base.TokenizeStrategy.set_strategy(tokenize_strategy)
|
||||||
|
|
||||||
|
# Set text encoding strategy
|
||||||
|
caption_dropout_rate = getattr(args, 'caption_dropout_rate', 0.0)
|
||||||
|
text_encoding_strategy = strategy_anima.AnimaTextEncodingStrategy(
|
||||||
|
dropout_rate=caption_dropout_rate,
|
||||||
|
)
|
||||||
|
strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy)
|
||||||
|
|
||||||
|
# Prepare text encoder (always frozen for Anima)
|
||||||
|
qwen3_text_encoder.to(weight_dtype)
|
||||||
|
qwen3_text_encoder.requires_grad_(False)
|
||||||
|
|
||||||
|
# Cache text encoder outputs
|
||||||
|
sample_prompts_te_outputs = None
|
||||||
|
if args.cache_text_encoder_outputs:
|
||||||
|
qwen3_text_encoder.to(accelerator.device)
|
||||||
|
qwen3_text_encoder.eval()
|
||||||
|
|
||||||
|
text_encoder_caching_strategy = strategy_anima.AnimaTextEncoderOutputsCachingStrategy(
|
||||||
|
args.cache_text_encoder_outputs_to_disk,
|
||||||
|
args.text_encoder_batch_size,
|
||||||
|
args.skip_cache_check,
|
||||||
|
is_partial=False,
|
||||||
|
)
|
||||||
|
strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_caching_strategy)
|
||||||
|
|
||||||
|
with accelerator.autocast():
|
||||||
|
train_dataset_group.new_cache_text_encoder_outputs([qwen3_text_encoder], accelerator)
|
||||||
|
|
||||||
|
# cache sample prompt embeddings
|
||||||
|
if args.sample_prompts is not None:
|
||||||
|
logger.info(f"Cache Text Encoder outputs for sample prompts: {args.sample_prompts}")
|
||||||
|
prompts = train_util.load_prompts(args.sample_prompts)
|
||||||
|
sample_prompts_te_outputs = {}
|
||||||
|
with accelerator.autocast(), torch.no_grad():
|
||||||
|
for prompt_dict in prompts:
|
||||||
|
for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]:
|
||||||
|
if p not in sample_prompts_te_outputs:
|
||||||
|
logger.info(f" cache TE outputs for: {p}")
|
||||||
|
tokens_and_masks = tokenize_strategy.tokenize(p)
|
||||||
|
sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens(
|
||||||
|
tokenize_strategy,
|
||||||
|
[qwen3_text_encoder],
|
||||||
|
tokens_and_masks,
|
||||||
|
enable_dropout=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Pre-cache unconditional embeddings for caption dropout before text encoder is deleted
|
||||||
|
caption_dropout_rate = getattr(args, 'caption_dropout_rate', 0.0)
|
||||||
|
if caption_dropout_rate > 0.0:
|
||||||
|
with accelerator.autocast():
|
||||||
|
text_encoding_strategy.cache_uncond_embeddings(tokenize_strategy, [qwen3_text_encoder])
|
||||||
|
|
||||||
|
accelerator.wait_for_everyone()
|
||||||
|
|
||||||
|
# free text encoder memory
|
||||||
|
qwen3_text_encoder = None
|
||||||
|
clean_memory_on_device(accelerator.device)
|
||||||
|
|
||||||
|
# Load VAE and cache latents
|
||||||
|
logger.info("Loading Anima VAE...")
|
||||||
|
vae, vae_mean, vae_std, vae_scale = anima_utils.load_anima_vae(args.vae_path, dtype=weight_dtype, device="cpu")
|
||||||
|
|
||||||
|
if cache_latents:
|
||||||
|
vae.to(accelerator.device, dtype=weight_dtype)
|
||||||
|
vae.requires_grad_(False)
|
||||||
|
vae.eval()
|
||||||
|
|
||||||
|
train_dataset_group.new_cache_latents(vae, accelerator)
|
||||||
|
|
||||||
|
vae.to("cpu")
|
||||||
|
clean_memory_on_device(accelerator.device)
|
||||||
|
accelerator.wait_for_everyone()
|
||||||
|
|
||||||
|
# Load DiT (MiniTrainDIT + optional LLM Adapter)
|
||||||
|
logger.info("Loading Anima DiT...")
|
||||||
|
dit = anima_utils.load_anima_dit(
|
||||||
|
args.dit_path,
|
||||||
|
dtype=weight_dtype,
|
||||||
|
device="cpu",
|
||||||
|
transformer_dtype=transformer_dtype,
|
||||||
|
llm_adapter_path=getattr(args, 'llm_adapter_path', None),
|
||||||
|
disable_mmap=getattr(args, 'disable_mmap_load_safetensors', False),
|
||||||
|
)
|
||||||
|
|
||||||
|
if args.gradient_checkpointing:
|
||||||
|
dit.enable_gradient_checkpointing(
|
||||||
|
cpu_offload=args.cpu_offload_checkpointing,
|
||||||
|
unsloth_offload=getattr(args, 'unsloth_offload_checkpointing', False),
|
||||||
|
)
|
||||||
|
|
||||||
|
if getattr(args, 'flash_attn', False):
|
||||||
|
dit.set_flash_attn(True)
|
||||||
|
|
||||||
|
train_dit = args.learning_rate != 0
|
||||||
|
dit.requires_grad_(train_dit)
|
||||||
|
if not train_dit:
|
||||||
|
dit.to(accelerator.device, dtype=weight_dtype)
|
||||||
|
|
||||||
|
# Block swap
|
||||||
|
is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0
|
||||||
|
if is_swapping_blocks:
|
||||||
|
logger.info(f"Enable block swap: blocks_to_swap={args.blocks_to_swap}")
|
||||||
|
dit.enable_block_swap(args.blocks_to_swap, accelerator.device)
|
||||||
|
|
||||||
|
if not cache_latents:
|
||||||
|
vae.requires_grad_(False)
|
||||||
|
vae.eval()
|
||||||
|
vae.to(accelerator.device, dtype=weight_dtype)
|
||||||
|
# Move scale tensors to same device as VAE for on-the-fly encoding
|
||||||
|
vae_scale = [s.to(accelerator.device) if isinstance(s, torch.Tensor) else s for s in vae_scale]
|
||||||
|
|
||||||
|
# Setup optimizer with parameter groups
|
||||||
|
if train_dit:
|
||||||
|
param_groups = anima_train_utils.get_anima_param_groups(
|
||||||
|
dit,
|
||||||
|
base_lr=args.learning_rate,
|
||||||
|
self_attn_lr=getattr(args, 'self_attn_lr', None),
|
||||||
|
cross_attn_lr=getattr(args, 'cross_attn_lr', None),
|
||||||
|
mlp_lr=getattr(args, 'mlp_lr', None),
|
||||||
|
mod_lr=getattr(args, 'mod_lr', None),
|
||||||
|
llm_adapter_lr=getattr(args, 'llm_adapter_lr', None),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
param_groups = []
|
||||||
|
|
||||||
|
training_models = []
|
||||||
|
if train_dit:
|
||||||
|
training_models.append(dit)
|
||||||
|
|
||||||
|
# calculate trainable parameters
|
||||||
|
n_params = 0
|
||||||
|
for group in param_groups:
|
||||||
|
for p in group["params"]:
|
||||||
|
n_params += p.numel()
|
||||||
|
|
||||||
|
accelerator.print(f"train dit: {train_dit}")
|
||||||
|
accelerator.print(f"number of training models: {len(training_models)}")
|
||||||
|
accelerator.print(f"number of trainable parameters: {n_params:,}")
|
||||||
|
|
||||||
|
# prepare optimizer
|
||||||
|
accelerator.print("prepare optimizer, data loader etc.")
|
||||||
|
|
||||||
|
if args.blockwise_fused_optimizers:
|
||||||
|
# Split params into per-block groups for blockwise fused optimizer
|
||||||
|
# Build param_id → lr mapping from param_groups to propagate per-component LRs
|
||||||
|
param_lr_map = {}
|
||||||
|
for group in param_groups:
|
||||||
|
for p in group['params']:
|
||||||
|
param_lr_map[id(p)] = group['lr']
|
||||||
|
|
||||||
|
grouped_params = []
|
||||||
|
param_group = {}
|
||||||
|
named_parameters = list(dit.named_parameters())
|
||||||
|
for name, p in named_parameters:
|
||||||
|
if not p.requires_grad:
|
||||||
|
continue
|
||||||
|
# Determine block type and index
|
||||||
|
if name.startswith("blocks."):
|
||||||
|
block_index = int(name.split(".")[1])
|
||||||
|
block_type = "blocks"
|
||||||
|
elif name.startswith("llm_adapter.blocks."):
|
||||||
|
block_index = int(name.split(".")[2])
|
||||||
|
block_type = "llm_adapter"
|
||||||
|
else:
|
||||||
|
block_index = -1
|
||||||
|
block_type = "other"
|
||||||
|
|
||||||
|
param_group_key = (block_type, block_index)
|
||||||
|
if param_group_key not in param_group:
|
||||||
|
param_group[param_group_key] = []
|
||||||
|
param_group[param_group_key].append(p)
|
||||||
|
|
||||||
|
for param_group_key, params in param_group.items():
|
||||||
|
# Use per-component LR from param_groups if available
|
||||||
|
lr = param_lr_map.get(id(params[0]), args.learning_rate)
|
||||||
|
grouped_params.append({"params": params, "lr": lr})
|
||||||
|
num_params = sum(p.numel() for p in params)
|
||||||
|
accelerator.print(f"block {param_group_key}: {num_params} parameters, lr={lr}")
|
||||||
|
|
||||||
|
# Create per-group optimizers
|
||||||
|
optimizers = []
|
||||||
|
for group in grouped_params:
|
||||||
|
_, _, opt = train_util.get_optimizer(args, trainable_params=[group])
|
||||||
|
optimizers.append(opt)
|
||||||
|
optimizer = optimizers[0] # avoid error in following code
|
||||||
|
|
||||||
|
logger.info(f"using {len(optimizers)} optimizers for blockwise fused optimizers")
|
||||||
|
|
||||||
|
if train_util.is_schedulefree_optimizer(optimizers[0], args):
|
||||||
|
raise ValueError("Schedule-free optimizer is not supported with blockwise fused optimizers")
|
||||||
|
optimizer_train_fn = lambda: None
|
||||||
|
optimizer_eval_fn = lambda: None
|
||||||
|
elif args.fused_backward_pass:
|
||||||
|
# Pass per-component param_groups directly to preserve per-component LRs
|
||||||
|
_, _, optimizer = train_util.get_optimizer(args, trainable_params=param_groups)
|
||||||
|
optimizer_train_fn, optimizer_eval_fn = train_util.get_optimizer_train_eval_fn(optimizer, args)
|
||||||
|
else:
|
||||||
|
_, _, optimizer = train_util.get_optimizer(args, trainable_params=param_groups)
|
||||||
|
optimizer_train_fn, optimizer_eval_fn = train_util.get_optimizer_train_eval_fn(optimizer, args)
|
||||||
|
|
||||||
|
# prepare dataloader
|
||||||
|
train_dataset_group.set_current_strategies()
|
||||||
|
|
||||||
|
n_workers = min(args.max_data_loader_n_workers, os.cpu_count())
|
||||||
|
train_dataloader = torch.utils.data.DataLoader(
|
||||||
|
train_dataset_group,
|
||||||
|
batch_size=1,
|
||||||
|
shuffle=True,
|
||||||
|
collate_fn=collator,
|
||||||
|
num_workers=n_workers,
|
||||||
|
persistent_workers=args.persistent_data_loader_workers,
|
||||||
|
)
|
||||||
|
|
||||||
|
# calculate training steps
|
||||||
|
if args.max_train_epochs is not None:
|
||||||
|
args.max_train_steps = args.max_train_epochs * math.ceil(
|
||||||
|
len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps
|
||||||
|
)
|
||||||
|
accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs: {args.max_train_steps}")
|
||||||
|
|
||||||
|
train_dataset_group.set_max_train_steps(args.max_train_steps)
|
||||||
|
|
||||||
|
# lr scheduler
|
||||||
|
if args.blockwise_fused_optimizers:
|
||||||
|
lr_schedulers = [train_util.get_scheduler_fix(args, opt, accelerator.num_processes) for opt in optimizers]
|
||||||
|
lr_scheduler = lr_schedulers[0] # avoid error in following code
|
||||||
|
else:
|
||||||
|
lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes)
|
||||||
|
|
||||||
|
# full fp16/bf16 training
|
||||||
|
if args.full_fp16:
|
||||||
|
assert args.mixed_precision == "fp16", "full_fp16 requires mixed_precision='fp16'"
|
||||||
|
accelerator.print("enable full fp16 training.")
|
||||||
|
dit.to(weight_dtype)
|
||||||
|
elif args.full_bf16:
|
||||||
|
assert args.mixed_precision == "bf16", "full_bf16 requires mixed_precision='bf16'"
|
||||||
|
accelerator.print("enable full bf16 training.")
|
||||||
|
dit.to(weight_dtype)
|
||||||
|
|
||||||
|
# move text encoder to GPU if not cached
|
||||||
|
if not args.cache_text_encoder_outputs and qwen3_text_encoder is not None:
|
||||||
|
qwen3_text_encoder.to(accelerator.device)
|
||||||
|
|
||||||
|
clean_memory_on_device(accelerator.device)
|
||||||
|
|
||||||
|
# Prepare with accelerator
|
||||||
|
# Temporarily move non-training models off GPU to reduce memory during DDP init
|
||||||
|
# if not args.cache_text_encoder_outputs and qwen3_text_encoder is not None:
|
||||||
|
# qwen3_text_encoder.to("cpu")
|
||||||
|
# if not cache_latents and vae is not None:
|
||||||
|
# vae.to("cpu")
|
||||||
|
# clean_memory_on_device(accelerator.device)
|
||||||
|
|
||||||
|
if args.deepspeed:
|
||||||
|
ds_model = deepspeed_utils.prepare_deepspeed_model(args, mmdit=dit)
|
||||||
|
ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
|
||||||
|
ds_model, optimizer, train_dataloader, lr_scheduler
|
||||||
|
)
|
||||||
|
training_models = [ds_model]
|
||||||
|
else:
|
||||||
|
if train_dit:
|
||||||
|
dit = accelerator.prepare(dit, device_placement=[not is_swapping_blocks])
|
||||||
|
if is_swapping_blocks:
|
||||||
|
accelerator.unwrap_model(dit).move_to_device_except_swap_blocks(accelerator.device)
|
||||||
|
optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
|
||||||
|
|
||||||
|
# Move non-training models back to GPU
|
||||||
|
if not args.cache_text_encoder_outputs and qwen3_text_encoder is not None:
|
||||||
|
qwen3_text_encoder.to(accelerator.device)
|
||||||
|
if not cache_latents and vae is not None:
|
||||||
|
vae.to(accelerator.device, dtype=weight_dtype)
|
||||||
|
|
||||||
|
if args.full_fp16:
|
||||||
|
train_util.patch_accelerator_for_fp16_training(accelerator)
|
||||||
|
|
||||||
|
# resume
|
||||||
|
train_util.resume_from_local_or_hf_if_specified(accelerator, args)
|
||||||
|
|
||||||
|
if args.fused_backward_pass:
|
||||||
|
import library.adafactor_fused
|
||||||
|
|
||||||
|
library.adafactor_fused.patch_adafactor_fused(optimizer)
|
||||||
|
|
||||||
|
for param_group in optimizer.param_groups:
|
||||||
|
for parameter in param_group["params"]:
|
||||||
|
if parameter.requires_grad:
|
||||||
|
|
||||||
|
def create_grad_hook(p_group):
|
||||||
|
def grad_hook(tensor: torch.Tensor):
|
||||||
|
if accelerator.sync_gradients and args.max_grad_norm != 0.0:
|
||||||
|
accelerator.clip_grad_norm_(tensor, args.max_grad_norm)
|
||||||
|
optimizer.step_param(tensor, p_group)
|
||||||
|
tensor.grad = None
|
||||||
|
|
||||||
|
return grad_hook
|
||||||
|
|
||||||
|
parameter.register_post_accumulate_grad_hook(create_grad_hook(param_group))
|
||||||
|
|
||||||
|
elif args.blockwise_fused_optimizers:
|
||||||
|
# Prepare additional optimizers and lr schedulers
|
||||||
|
for i in range(1, len(optimizers)):
|
||||||
|
optimizers[i] = accelerator.prepare(optimizers[i])
|
||||||
|
lr_schedulers[i] = accelerator.prepare(lr_schedulers[i])
|
||||||
|
|
||||||
|
# Counters for blockwise gradient hook
|
||||||
|
optimizer_hooked_count = {}
|
||||||
|
num_parameters_per_group = [0] * len(optimizers)
|
||||||
|
parameter_optimizer_map = {}
|
||||||
|
|
||||||
|
for opt_idx, opt in enumerate(optimizers):
|
||||||
|
for param_group in opt.param_groups:
|
||||||
|
for parameter in param_group["params"]:
|
||||||
|
if parameter.requires_grad:
|
||||||
|
|
||||||
|
def grad_hook(parameter: torch.Tensor):
|
||||||
|
if accelerator.sync_gradients and args.max_grad_norm != 0.0:
|
||||||
|
accelerator.clip_grad_norm_(parameter, args.max_grad_norm)
|
||||||
|
|
||||||
|
i = parameter_optimizer_map[parameter]
|
||||||
|
optimizer_hooked_count[i] += 1
|
||||||
|
if optimizer_hooked_count[i] == num_parameters_per_group[i]:
|
||||||
|
optimizers[i].step()
|
||||||
|
optimizers[i].zero_grad(set_to_none=True)
|
||||||
|
|
||||||
|
parameter.register_post_accumulate_grad_hook(grad_hook)
|
||||||
|
parameter_optimizer_map[parameter] = opt_idx
|
||||||
|
num_parameters_per_group[opt_idx] += 1
|
||||||
|
|
||||||
|
# Training loop
|
||||||
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
|
num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||||
|
if (args.save_n_epoch_ratio is not None) and (args.save_n_epoch_ratio > 0):
|
||||||
|
args.save_every_n_epochs = math.floor(num_train_epochs / args.save_n_epoch_ratio) or 1
|
||||||
|
|
||||||
|
accelerator.print("running training")
|
||||||
|
accelerator.print(f" num examples: {train_dataset_group.num_train_images}")
|
||||||
|
accelerator.print(f" num batches per epoch: {len(train_dataloader)}")
|
||||||
|
accelerator.print(f" num epochs: {num_train_epochs}")
|
||||||
|
accelerator.print(
|
||||||
|
f" batch size per device: {', '.join([str(d.batch_size) for d in train_dataset_group.datasets])}"
|
||||||
|
)
|
||||||
|
accelerator.print(f" gradient accumulation steps = {args.gradient_accumulation_steps}")
|
||||||
|
accelerator.print(f" total optimization steps: {args.max_train_steps}")
|
||||||
|
|
||||||
|
progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps")
|
||||||
|
global_step = 0
|
||||||
|
|
||||||
|
if accelerator.is_main_process:
|
||||||
|
init_kwargs = {}
|
||||||
|
if args.wandb_run_name:
|
||||||
|
init_kwargs["wandb"] = {"name": args.wandb_run_name}
|
||||||
|
if args.log_tracker_config is not None:
|
||||||
|
init_kwargs = toml.load(args.log_tracker_config)
|
||||||
|
accelerator.init_trackers(
|
||||||
|
"finetuning" if args.log_tracker_name is None else args.log_tracker_name,
|
||||||
|
config=train_util.get_sanitized_config_or_none(args),
|
||||||
|
init_kwargs=init_kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
if "wandb" in [tracker.name for tracker in accelerator.trackers]:
|
||||||
|
import wandb
|
||||||
|
wandb.define_metric("epoch")
|
||||||
|
wandb.define_metric("loss/epoch", step_metric="epoch")
|
||||||
|
|
||||||
|
if is_swapping_blocks:
|
||||||
|
accelerator.unwrap_model(dit).prepare_block_swap_before_forward()
|
||||||
|
|
||||||
|
# For --sample_at_first
|
||||||
|
optimizer_eval_fn()
|
||||||
|
anima_train_utils.sample_images(
|
||||||
|
accelerator, args, 0, global_step, dit, vae, vae_scale,
|
||||||
|
qwen3_text_encoder, tokenize_strategy, text_encoding_strategy,
|
||||||
|
sample_prompts_te_outputs,
|
||||||
|
)
|
||||||
|
optimizer_train_fn()
|
||||||
|
if len(accelerator.trackers) > 0:
|
||||||
|
accelerator.log({}, step=0)
|
||||||
|
|
||||||
|
# Show model info
|
||||||
|
unwrapped_dit = accelerator.unwrap_model(dit) if dit is not None else None
|
||||||
|
if unwrapped_dit is not None:
|
||||||
|
logger.info(f"dit device: {unwrapped_dit.t_embedding_norm.weight.device}, dtype: {unwrapped_dit.t_embedding_norm.weight.dtype}")
|
||||||
|
if qwen3_text_encoder is not None:
|
||||||
|
logger.info(f"qwen3 device: {next(qwen3_text_encoder.parameters()).device}")
|
||||||
|
if vae is not None:
|
||||||
|
logger.info(f"vae device: {next(vae.parameters()).device}")
|
||||||
|
|
||||||
|
loss_recorder = train_util.LossRecorder()
|
||||||
|
epoch = 0
|
||||||
|
for epoch in range(num_train_epochs):
|
||||||
|
accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}")
|
||||||
|
current_epoch.value = epoch + 1
|
||||||
|
|
||||||
|
for m in training_models:
|
||||||
|
m.train()
|
||||||
|
|
||||||
|
for step, batch in enumerate(train_dataloader):
|
||||||
|
current_step.value = global_step
|
||||||
|
|
||||||
|
if args.blockwise_fused_optimizers:
|
||||||
|
optimizer_hooked_count = {i: 0 for i in range(len(optimizers))} # reset counter for each step
|
||||||
|
|
||||||
|
with accelerator.accumulate(*training_models):
|
||||||
|
# Get latents
|
||||||
|
if "latents" in batch and batch["latents"] is not None:
|
||||||
|
latents = batch["latents"].to(accelerator.device, dtype=weight_dtype)
|
||||||
|
else:
|
||||||
|
with torch.no_grad():
|
||||||
|
# images are already [-1, 1] from IMAGE_TRANSFORMS, add temporal dim
|
||||||
|
images = batch["images"].to(accelerator.device, dtype=weight_dtype)
|
||||||
|
images = images.unsqueeze(2) # (B, C, 1, H, W)
|
||||||
|
latents = vae.encode(images, vae_scale).to(accelerator.device, dtype=weight_dtype)
|
||||||
|
|
||||||
|
if torch.any(torch.isnan(latents)):
|
||||||
|
accelerator.print("NaN found in latents, replacing with zeros")
|
||||||
|
latents = torch.nan_to_num(latents, 0, out=latents)
|
||||||
|
|
||||||
|
# Get text encoder outputs
|
||||||
|
text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None)
|
||||||
|
if text_encoder_outputs_list is not None:
|
||||||
|
# Cached outputs
|
||||||
|
text_encoder_outputs_list = text_encoding_strategy.drop_cached_text_encoder_outputs(
|
||||||
|
*text_encoder_outputs_list
|
||||||
|
)
|
||||||
|
prompt_embeds, attn_mask, t5_input_ids, t5_attn_mask = text_encoder_outputs_list
|
||||||
|
else:
|
||||||
|
# Encode on-the-fly
|
||||||
|
input_ids_list = batch["input_ids_list"]
|
||||||
|
qwen3_input_ids, qwen3_attn_mask, t5_input_ids, t5_attn_mask = input_ids_list
|
||||||
|
with torch.no_grad():
|
||||||
|
prompt_embeds, attn_mask, t5_input_ids, t5_attn_mask = text_encoding_strategy.encode_tokens(
|
||||||
|
tokenize_strategy,
|
||||||
|
[qwen3_text_encoder],
|
||||||
|
[qwen3_input_ids, qwen3_attn_mask, t5_input_ids, t5_attn_mask],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Move to device
|
||||||
|
prompt_embeds = prompt_embeds.to(accelerator.device, dtype=weight_dtype)
|
||||||
|
attn_mask = attn_mask.to(accelerator.device)
|
||||||
|
t5_input_ids = t5_input_ids.to(accelerator.device, dtype=torch.long)
|
||||||
|
t5_attn_mask = t5_attn_mask.to(accelerator.device)
|
||||||
|
|
||||||
|
# Noise and timesteps
|
||||||
|
noise = torch.randn_like(latents)
|
||||||
|
|
||||||
|
noisy_model_input, timesteps, sigmas = anima_train_utils.get_noisy_model_input_and_timesteps(
|
||||||
|
args, latents, noise, accelerator.device, weight_dtype
|
||||||
|
)
|
||||||
|
|
||||||
|
# NaN checks
|
||||||
|
if torch.any(torch.isnan(noisy_model_input)):
|
||||||
|
accelerator.print("NaN found in noisy_model_input, replacing with zeros")
|
||||||
|
noisy_model_input = torch.nan_to_num(noisy_model_input, 0, out=noisy_model_input)
|
||||||
|
|
||||||
|
# Create padding mask
|
||||||
|
# padding_mask: (B, 1, H_latent, W_latent)
|
||||||
|
bs = latents.shape[0]
|
||||||
|
h_latent = latents.shape[-2]
|
||||||
|
w_latent = latents.shape[-1]
|
||||||
|
padding_mask = torch.zeros(
|
||||||
|
bs, 1, h_latent, w_latent,
|
||||||
|
dtype=weight_dtype, device=accelerator.device
|
||||||
|
)
|
||||||
|
|
||||||
|
# DiT forward (LLM adapter runs inside forward for DDP gradient sync)
|
||||||
|
if is_swapping_blocks:
|
||||||
|
accelerator.unwrap_model(dit).prepare_block_swap_before_forward()
|
||||||
|
|
||||||
|
with accelerator.autocast():
|
||||||
|
model_pred = dit(
|
||||||
|
noisy_model_input,
|
||||||
|
timesteps,
|
||||||
|
prompt_embeds,
|
||||||
|
padding_mask=padding_mask,
|
||||||
|
source_attention_mask=attn_mask,
|
||||||
|
t5_input_ids=t5_input_ids,
|
||||||
|
t5_attn_mask=t5_attn_mask,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Compute loss (rectified flow: target = noise - latents)
|
||||||
|
target = noise - latents
|
||||||
|
|
||||||
|
# Weighting
|
||||||
|
weighting = anima_train_utils.compute_loss_weighting_for_anima(
|
||||||
|
weighting_scheme=args.weighting_scheme, sigmas=sigmas
|
||||||
|
)
|
||||||
|
|
||||||
|
# Loss
|
||||||
|
huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, None)
|
||||||
|
loss = train_util.conditional_loss(
|
||||||
|
model_pred.float(), target.float(), args.loss_type, "none", huber_c
|
||||||
|
)
|
||||||
|
if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None):
|
||||||
|
loss = apply_masked_loss(loss, batch)
|
||||||
|
loss = loss.mean([1, 2, 3, 4]) # (B, C, T, H, W) -> (B,)
|
||||||
|
|
||||||
|
if weighting is not None:
|
||||||
|
loss = loss * weighting
|
||||||
|
|
||||||
|
loss_weights = batch["loss_weights"]
|
||||||
|
loss = loss * loss_weights
|
||||||
|
loss = loss.mean()
|
||||||
|
|
||||||
|
accelerator.backward(loss)
|
||||||
|
|
||||||
|
if not (args.fused_backward_pass or args.blockwise_fused_optimizers):
|
||||||
|
if accelerator.sync_gradients and args.max_grad_norm != 0.0:
|
||||||
|
params_to_clip = []
|
||||||
|
for m in training_models:
|
||||||
|
params_to_clip.extend(m.parameters())
|
||||||
|
accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
|
||||||
|
|
||||||
|
optimizer.step()
|
||||||
|
lr_scheduler.step()
|
||||||
|
optimizer.zero_grad(set_to_none=True)
|
||||||
|
else:
|
||||||
|
# optimizer.step() and optimizer.zero_grad() are called in the optimizer hook
|
||||||
|
lr_scheduler.step()
|
||||||
|
if args.blockwise_fused_optimizers:
|
||||||
|
for i in range(1, len(optimizers)):
|
||||||
|
lr_schedulers[i].step()
|
||||||
|
|
||||||
|
# Checks if the accelerator has performed an optimization step
|
||||||
|
if accelerator.sync_gradients:
|
||||||
|
progress_bar.update(1)
|
||||||
|
global_step += 1
|
||||||
|
|
||||||
|
optimizer_eval_fn()
|
||||||
|
anima_train_utils.sample_images(
|
||||||
|
accelerator, args, None, global_step, dit, vae, vae_scale,
|
||||||
|
qwen3_text_encoder, tokenize_strategy, text_encoding_strategy,
|
||||||
|
sample_prompts_te_outputs,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Save at specific steps
|
||||||
|
if args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0:
|
||||||
|
accelerator.wait_for_everyone()
|
||||||
|
if accelerator.is_main_process:
|
||||||
|
anima_train_utils.save_anima_model_on_epoch_end_or_stepwise(
|
||||||
|
args,
|
||||||
|
False,
|
||||||
|
accelerator,
|
||||||
|
save_dtype,
|
||||||
|
epoch,
|
||||||
|
num_train_epochs,
|
||||||
|
global_step,
|
||||||
|
accelerator.unwrap_model(dit) if train_dit else None,
|
||||||
|
)
|
||||||
|
optimizer_train_fn()
|
||||||
|
|
||||||
|
current_loss = loss.detach().item()
|
||||||
|
if len(accelerator.trackers) > 0:
|
||||||
|
logs = {"loss": current_loss}
|
||||||
|
train_util.append_lr_to_logs_with_names(
|
||||||
|
logs, lr_scheduler, args.optimizer_type,
|
||||||
|
["base", "self_attn", "cross_attn", "mlp", "mod", "llm_adapter"] if train_dit else []
|
||||||
|
)
|
||||||
|
accelerator.log(logs, step=global_step)
|
||||||
|
|
||||||
|
loss_recorder.add(epoch=epoch, step=step, loss=current_loss)
|
||||||
|
avr_loss: float = loss_recorder.moving_average
|
||||||
|
logs = {"avr_loss": avr_loss}
|
||||||
|
progress_bar.set_postfix(**logs)
|
||||||
|
|
||||||
|
if global_step >= args.max_train_steps:
|
||||||
|
break
|
||||||
|
|
||||||
|
if len(accelerator.trackers) > 0:
|
||||||
|
logs = {"loss/epoch": loss_recorder.moving_average, "epoch": epoch + 1}
|
||||||
|
accelerator.log(logs, step=global_step)
|
||||||
|
|
||||||
|
accelerator.wait_for_everyone()
|
||||||
|
|
||||||
|
optimizer_eval_fn()
|
||||||
|
if args.save_every_n_epochs is not None:
|
||||||
|
if accelerator.is_main_process:
|
||||||
|
anima_train_utils.save_anima_model_on_epoch_end_or_stepwise(
|
||||||
|
args,
|
||||||
|
True,
|
||||||
|
accelerator,
|
||||||
|
save_dtype,
|
||||||
|
epoch,
|
||||||
|
num_train_epochs,
|
||||||
|
global_step,
|
||||||
|
accelerator.unwrap_model(dit) if train_dit else None,
|
||||||
|
)
|
||||||
|
|
||||||
|
anima_train_utils.sample_images(
|
||||||
|
accelerator, args, epoch + 1, global_step, dit, vae, vae_scale,
|
||||||
|
qwen3_text_encoder, tokenize_strategy, text_encoding_strategy,
|
||||||
|
sample_prompts_te_outputs,
|
||||||
|
)
|
||||||
|
|
||||||
|
# End training
|
||||||
|
is_main_process = accelerator.is_main_process
|
||||||
|
dit = accelerator.unwrap_model(dit)
|
||||||
|
|
||||||
|
accelerator.end_training()
|
||||||
|
optimizer_eval_fn()
|
||||||
|
|
||||||
|
if args.save_state or args.save_state_on_train_end:
|
||||||
|
train_util.save_state_on_train_end(args, accelerator)
|
||||||
|
|
||||||
|
del accelerator
|
||||||
|
|
||||||
|
if is_main_process and train_dit:
|
||||||
|
anima_train_utils.save_anima_model_on_train_end(
|
||||||
|
args,
|
||||||
|
save_dtype,
|
||||||
|
epoch,
|
||||||
|
global_step,
|
||||||
|
dit,
|
||||||
|
)
|
||||||
|
logger.info("model saved.")
|
||||||
|
|
||||||
|
|
||||||
|
def setup_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for Anima full finetuning.

    Registers the shared sd-scripts argument groups first, then the
    Anima-script-specific boolean flags.
    """
    parser = argparse.ArgumentParser()

    # Shared argument groups provided by the library modules (order preserved
    # so --help output stays stable).
    add_logging_arguments(parser)
    train_util.add_sd_models_arguments(parser)
    train_util.add_dataset_arguments(parser, True, True, True)
    train_util.add_training_arguments(parser, False)
    train_util.add_masked_loss_arguments(parser)
    deepspeed_utils.add_deepspeed_arguments(parser)
    train_util.add_sd_saving_arguments(parser)
    train_util.add_optimizer_arguments(parser)
    config_util.add_config_arguments(parser)
    add_custom_train_arguments(parser)
    train_util.add_dit_training_arguments(parser)
    anima_train_utils.add_anima_training_arguments(parser)
    sai_model_spec.add_model_spec_arguments(parser)

    # Script-specific store_true flags, registered table-driven.
    script_flags = [
        (
            "--blockwise_fused_optimizers",
            "enable blockwise optimizers for fused backward pass and optimizer step",
        ),
        (
            "--cpu_offload_checkpointing",
            "offload gradient checkpointing to CPU (reduces VRAM at cost of speed)",
        ),
        (
            "--unsloth_offload_checkpointing",
            "offload activations to CPU RAM using async non-blocking transfers (faster than --cpu_offload_checkpointing). "
            "Cannot be used with --cpu_offload_checkpointing or --blocks_to_swap.",
        ),
        (
            "--skip_latents_validity_check",
            "[Deprecated] use 'skip_cache_check' instead",
        ),
    ]
    for flag, help_text in script_flags:
        parser.add_argument(flag, action="store_true", help=help_text)

    return parser
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: parse args, validate, merge optional config, train.
    parser = setup_parser()
    args = parser.parse_args()

    # Validate CLI-only constraints before merging values from a config file.
    train_util.verify_command_line_training_args(args)
    args = train_util.read_config_from_file(args, parser)

    train(args)
|
||||||
540
anima_train_network.py
Normal file
540
anima_train_network.py
Normal file
@@ -0,0 +1,540 @@
|
|||||||
|
# Anima LoRA training script
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import math
|
||||||
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from accelerate import Accelerator
|
||||||
|
from library.device_utils import init_ipex, clean_memory_on_device
|
||||||
|
|
||||||
|
init_ipex()
|
||||||
|
|
||||||
|
from library import anima_models, anima_train_utils, anima_utils, strategy_anima, strategy_base, train_util
|
||||||
|
import train_network
|
||||||
|
from library.utils import setup_logging
|
||||||
|
|
||||||
|
setup_logging()
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class AnimaNetworkTrainer(train_network.NetworkTrainer):
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
self.sample_prompts_te_outputs = None
|
||||||
|
self.vae = None
|
||||||
|
self.vae_scale = None
|
||||||
|
self.qwen3_text_encoder = None
|
||||||
|
self.qwen3_tokenizer = None
|
||||||
|
self.t5_tokenizer = None
|
||||||
|
self.tokenize_strategy = None
|
||||||
|
self.text_encoding_strategy = None
|
||||||
|
|
||||||
|
def assert_extra_args(
|
||||||
|
self,
|
||||||
|
args,
|
||||||
|
train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset],
|
||||||
|
val_dataset_group: Optional[train_util.DatasetGroup],
|
||||||
|
):
|
||||||
|
if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs:
|
||||||
|
logger.warning(
|
||||||
|
"cache_text_encoder_outputs_to_disk is enabled, so cache_text_encoder_outputs is also enabled"
|
||||||
|
)
|
||||||
|
args.cache_text_encoder_outputs = True
|
||||||
|
|
||||||
|
# Anima uses embedding-level dropout (in AnimaTextEncodingStrategy) instead of
|
||||||
|
# dataset-level caption dropout, so zero out subset-level rates to allow caching.
|
||||||
|
caption_dropout_rate = getattr(args, 'caption_dropout_rate', 0.0)
|
||||||
|
if caption_dropout_rate > 0:
|
||||||
|
logger.info(f"Using embedding-level caption dropout rate: {caption_dropout_rate}")
|
||||||
|
if hasattr(train_dataset_group, 'datasets'):
|
||||||
|
for dataset in train_dataset_group.datasets:
|
||||||
|
for subset in dataset.subsets:
|
||||||
|
subset.caption_dropout_rate = 0.0
|
||||||
|
|
||||||
|
if args.cache_text_encoder_outputs:
|
||||||
|
assert (
|
||||||
|
train_dataset_group.is_text_encoder_output_cacheable()
|
||||||
|
), "when caching Text Encoder output, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used"
|
||||||
|
|
||||||
|
assert (
|
||||||
|
args.blocks_to_swap is None or args.blocks_to_swap == 0
|
||||||
|
) or not args.cpu_offload_checkpointing, "blocks_to_swap is not supported with cpu_offload_checkpointing"
|
||||||
|
|
||||||
|
if getattr(args, 'unsloth_offload_checkpointing', False):
|
||||||
|
if not args.gradient_checkpointing:
|
||||||
|
logger.warning("unsloth_offload_checkpointing is enabled, so gradient_checkpointing is also enabled")
|
||||||
|
args.gradient_checkpointing = True
|
||||||
|
assert not args.cpu_offload_checkpointing, \
|
||||||
|
"Cannot use both --unsloth_offload_checkpointing and --cpu_offload_checkpointing"
|
||||||
|
assert (
|
||||||
|
args.blocks_to_swap is None or args.blocks_to_swap == 0
|
||||||
|
), "blocks_to_swap is not supported with unsloth_offload_checkpointing"
|
||||||
|
|
||||||
|
# Flash attention: validate availability
|
||||||
|
if getattr(args, 'flash_attn', False):
|
||||||
|
try:
|
||||||
|
import flash_attn # noqa: F401
|
||||||
|
logger.info("Flash Attention enabled for DiT blocks")
|
||||||
|
except ImportError:
|
||||||
|
logger.warning("flash_attn package not installed, falling back to PyTorch SDPA")
|
||||||
|
args.flash_attn = False
|
||||||
|
|
||||||
|
if getattr(args, 'blockwise_fused_optimizers', False):
|
||||||
|
raise ValueError("blockwise_fused_optimizers is not supported with LoRA/NetworkTrainer")
|
||||||
|
|
||||||
|
train_dataset_group.verify_bucket_reso_steps(8) # WanVAE spatial downscale = 8
|
||||||
|
if val_dataset_group is not None:
|
||||||
|
val_dataset_group.verify_bucket_reso_steps(8)
|
||||||
|
|
||||||
|
def load_target_model(self, args, weight_dtype, accelerator):
|
||||||
|
# Load Qwen3 text encoder (tokenizers already loaded in get_tokenize_strategy)
|
||||||
|
logger.info("Loading Qwen3 text encoder...")
|
||||||
|
self.qwen3_text_encoder, _ = anima_utils.load_qwen3_text_encoder(
|
||||||
|
args.qwen3_path, dtype=weight_dtype, device="cpu"
|
||||||
|
)
|
||||||
|
self.qwen3_text_encoder.eval()
|
||||||
|
|
||||||
|
# Parse transformer_dtype
|
||||||
|
transformer_dtype = None
|
||||||
|
if hasattr(args, 'transformer_dtype') and args.transformer_dtype is not None:
|
||||||
|
transformer_dtype_map = {
|
||||||
|
"float16": torch.float16,
|
||||||
|
"bfloat16": torch.bfloat16,
|
||||||
|
"float32": torch.float32,
|
||||||
|
}
|
||||||
|
transformer_dtype = transformer_dtype_map.get(args.transformer_dtype, None)
|
||||||
|
|
||||||
|
# Load DiT
|
||||||
|
logger.info("Loading Anima DiT...")
|
||||||
|
dit = anima_utils.load_anima_dit(
|
||||||
|
args.dit_path,
|
||||||
|
dtype=weight_dtype,
|
||||||
|
device="cpu",
|
||||||
|
transformer_dtype=transformer_dtype,
|
||||||
|
llm_adapter_path=getattr(args, 'llm_adapter_path', None),
|
||||||
|
disable_mmap=getattr(args, 'disable_mmap_load_safetensors', False),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Flash attention
|
||||||
|
if getattr(args, 'flash_attn', False):
|
||||||
|
dit.set_flash_attn(True)
|
||||||
|
|
||||||
|
# Store unsloth preference so that when the base NetworkTrainer calls
|
||||||
|
# dit.enable_gradient_checkpointing(cpu_offload=...), we can override to use unsloth.
|
||||||
|
# The base trainer only passes cpu_offload, so we store the flag on the model.
|
||||||
|
self._use_unsloth_offload_checkpointing = getattr(args, 'unsloth_offload_checkpointing', False)
|
||||||
|
|
||||||
|
# Block swap
|
||||||
|
self.is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0
|
||||||
|
if self.is_swapping_blocks:
|
||||||
|
logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}")
|
||||||
|
dit.enable_block_swap(args.blocks_to_swap, accelerator.device)
|
||||||
|
|
||||||
|
# Load VAE
|
||||||
|
logger.info("Loading Anima VAE...")
|
||||||
|
self.vae, vae_mean, vae_std, self.vae_scale = anima_utils.load_anima_vae(
|
||||||
|
args.vae_path, dtype=weight_dtype, device="cpu"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Return format: (model_type, text_encoders, vae, unet)
|
||||||
|
return "anima", [self.qwen3_text_encoder], self.vae, dit
|
||||||
|
|
||||||
|
def get_tokenize_strategy(self, args):
|
||||||
|
# Load tokenizers from paths (called before load_target_model, so self.qwen3_tokenizer isn't set yet)
|
||||||
|
self.tokenize_strategy = strategy_anima.AnimaTokenizeStrategy(
|
||||||
|
qwen3_path=args.qwen3_path,
|
||||||
|
t5_tokenizer_path=getattr(args, 't5_tokenizer_path', None),
|
||||||
|
qwen3_max_length=args.qwen3_max_token_length,
|
||||||
|
t5_max_length=args.t5_max_token_length,
|
||||||
|
)
|
||||||
|
# Store references so load_target_model can reuse them
|
||||||
|
self.qwen3_tokenizer = self.tokenize_strategy.qwen3_tokenizer
|
||||||
|
self.t5_tokenizer = self.tokenize_strategy.t5_tokenizer
|
||||||
|
return self.tokenize_strategy
|
||||||
|
|
||||||
|
def get_tokenizers(self, tokenize_strategy: strategy_anima.AnimaTokenizeStrategy):
|
||||||
|
return [tokenize_strategy.qwen3_tokenizer]
|
||||||
|
|
||||||
|
def get_latents_caching_strategy(self, args):
|
||||||
|
return strategy_anima.AnimaLatentsCachingStrategy(
|
||||||
|
args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_text_encoding_strategy(self, args):
|
||||||
|
caption_dropout_rate = getattr(args, 'caption_dropout_rate', 0.0)
|
||||||
|
self.text_encoding_strategy = strategy_anima.AnimaTextEncodingStrategy(
|
||||||
|
dropout_rate=caption_dropout_rate,
|
||||||
|
)
|
||||||
|
return self.text_encoding_strategy
|
||||||
|
|
||||||
|
def post_process_network(self, args, accelerator, network, text_encoders, unet):
|
||||||
|
# Qwen3 text encoder is always frozen for Anima
|
||||||
|
pass
|
||||||
|
|
||||||
|
def get_models_for_text_encoding(self, args, accelerator, text_encoders):
|
||||||
|
if args.cache_text_encoder_outputs:
|
||||||
|
return None # no text encoders needed for encoding
|
||||||
|
return text_encoders
|
||||||
|
|
||||||
|
def get_text_encoders_train_flags(self, args, text_encoders):
|
||||||
|
return [False] # Qwen3 always frozen
|
||||||
|
|
||||||
|
def is_train_text_encoder(self, args):
|
||||||
|
return False # Qwen3 text encoder is always frozen for Anima
|
||||||
|
|
||||||
|
def get_text_encoder_outputs_caching_strategy(self, args):
|
||||||
|
if args.cache_text_encoder_outputs:
|
||||||
|
return strategy_anima.AnimaTextEncoderOutputsCachingStrategy(
|
||||||
|
args.cache_text_encoder_outputs_to_disk,
|
||||||
|
args.text_encoder_batch_size,
|
||||||
|
args.skip_cache_check,
|
||||||
|
is_partial=False,
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
|
def cache_text_encoder_outputs_if_needed(
|
||||||
|
self, args, accelerator: Accelerator, unet, vae, text_encoders, dataset: train_util.DatasetGroup, weight_dtype
|
||||||
|
):
|
||||||
|
if args.cache_text_encoder_outputs:
|
||||||
|
if not args.lowram:
|
||||||
|
logger.info("move vae and unet to cpu to save memory")
|
||||||
|
org_vae_device = next(vae.parameters()).device
|
||||||
|
org_unet_device = unet.device
|
||||||
|
vae.to("cpu")
|
||||||
|
unet.to("cpu")
|
||||||
|
clean_memory_on_device(accelerator.device)
|
||||||
|
|
||||||
|
logger.info("move text encoder to gpu")
|
||||||
|
text_encoders[0].to(accelerator.device, dtype=weight_dtype)
|
||||||
|
|
||||||
|
with accelerator.autocast():
|
||||||
|
dataset.new_cache_text_encoder_outputs(text_encoders, accelerator)
|
||||||
|
|
||||||
|
# cache sample prompts
|
||||||
|
if args.sample_prompts is not None:
|
||||||
|
logger.info(f"cache Text Encoder outputs for sample prompts: {args.sample_prompts}")
|
||||||
|
|
||||||
|
tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy()
|
||||||
|
text_encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy()
|
||||||
|
|
||||||
|
prompts = train_util.load_prompts(args.sample_prompts)
|
||||||
|
sample_prompts_te_outputs = {}
|
||||||
|
with accelerator.autocast(), torch.no_grad():
|
||||||
|
for prompt_dict in prompts:
|
||||||
|
for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]:
|
||||||
|
if p not in sample_prompts_te_outputs:
|
||||||
|
logger.info(f" cache TE outputs for: {p}")
|
||||||
|
tokens_and_masks = tokenize_strategy.tokenize(p)
|
||||||
|
sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens(
|
||||||
|
tokenize_strategy,
|
||||||
|
text_encoders,
|
||||||
|
tokens_and_masks,
|
||||||
|
enable_dropout=False,
|
||||||
|
)
|
||||||
|
self.sample_prompts_te_outputs = sample_prompts_te_outputs
|
||||||
|
|
||||||
|
# Pre-cache unconditional embeddings for caption dropout before text encoder is deleted
|
||||||
|
caption_dropout_rate = getattr(args, 'caption_dropout_rate', 0.0)
|
||||||
|
text_encoding_strategy_for_uncond = strategy_base.TextEncodingStrategy.get_strategy()
|
||||||
|
if caption_dropout_rate > 0.0:
|
||||||
|
tokenize_strategy_for_uncond = strategy_base.TokenizeStrategy.get_strategy()
|
||||||
|
with accelerator.autocast():
|
||||||
|
text_encoding_strategy_for_uncond.cache_uncond_embeddings(tokenize_strategy_for_uncond, text_encoders)
|
||||||
|
|
||||||
|
accelerator.wait_for_everyone()
|
||||||
|
|
||||||
|
# move text encoder back to cpu
|
||||||
|
logger.info("move text encoder back to cpu")
|
||||||
|
text_encoders[0].to("cpu")
|
||||||
|
clean_memory_on_device(accelerator.device)
|
||||||
|
|
||||||
|
if not args.lowram:
|
||||||
|
logger.info("move vae and unet back to original device")
|
||||||
|
vae.to(org_vae_device)
|
||||||
|
unet.to(org_unet_device)
|
||||||
|
else:
|
||||||
|
text_encoders[0].to(accelerator.device, dtype=weight_dtype)
|
||||||
|
|
||||||
|
def sample_images(self, accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet):
|
||||||
|
text_encoders = text_encoder if isinstance(text_encoder, list) else [text_encoder] # compatibility
|
||||||
|
te = self.get_models_for_text_encoding(args, accelerator, text_encoders)
|
||||||
|
qwen3_te = te[0] if te is not None else None
|
||||||
|
|
||||||
|
anima_train_utils.sample_images(
|
||||||
|
accelerator, args, epoch, global_step, unet, vae, self.vae_scale,
|
||||||
|
qwen3_te, self.tokenize_strategy, self.text_encoding_strategy,
|
||||||
|
self.sample_prompts_te_outputs,
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any:
|
||||||
|
noise_scheduler = anima_train_utils.FlowMatchEulerDiscreteScheduler(
|
||||||
|
num_train_timesteps=1000, shift=args.discrete_flow_shift
|
||||||
|
)
|
||||||
|
return noise_scheduler
|
||||||
|
|
||||||
|
def encode_images_to_latents(self, args, vae, images):
|
||||||
|
# images are already [-1,1] from IMAGE_TRANSFORMS, add temporal dim
|
||||||
|
images = images.unsqueeze(2) # (B, C, 1, H, W)
|
||||||
|
# Ensure scale tensors are on the same device as images
|
||||||
|
vae_device = images.device
|
||||||
|
scale = [s.to(vae_device) if isinstance(s, torch.Tensor) else s for s in self.vae_scale]
|
||||||
|
return vae.encode(images, scale)
|
||||||
|
|
||||||
|
def shift_scale_latents(self, args, latents):
|
||||||
|
# Latents already normalized by vae.encode with scale
|
||||||
|
return latents
|
||||||
|
|
||||||
|
def get_noise_pred_and_target(
|
||||||
|
self,
|
||||||
|
args,
|
||||||
|
accelerator,
|
||||||
|
noise_scheduler,
|
||||||
|
latents,
|
||||||
|
batch,
|
||||||
|
text_encoder_conds,
|
||||||
|
unet,
|
||||||
|
network,
|
||||||
|
weight_dtype,
|
||||||
|
train_unet,
|
||||||
|
is_train=True,
|
||||||
|
):
|
||||||
|
# Sample noise
|
||||||
|
noise = torch.randn_like(latents)
|
||||||
|
|
||||||
|
# Get noisy model input and timesteps
|
||||||
|
noisy_model_input, timesteps, sigmas = anima_train_utils.get_noisy_model_input_and_timesteps(
|
||||||
|
args, latents, noise, accelerator.device, weight_dtype
|
||||||
|
)
|
||||||
|
|
||||||
|
# Gradient checkpointing support
|
||||||
|
if args.gradient_checkpointing:
|
||||||
|
noisy_model_input.requires_grad_(True)
|
||||||
|
for t in text_encoder_conds:
|
||||||
|
if t is not None and t.dtype.is_floating_point:
|
||||||
|
t.requires_grad_(True)
|
||||||
|
|
||||||
|
# Unpack text encoder conditions
|
||||||
|
prompt_embeds, attn_mask, t5_input_ids, t5_attn_mask = text_encoder_conds
|
||||||
|
|
||||||
|
# Move to device
|
||||||
|
prompt_embeds = prompt_embeds.to(accelerator.device, dtype=weight_dtype)
|
||||||
|
attn_mask = attn_mask.to(accelerator.device)
|
||||||
|
t5_input_ids = t5_input_ids.to(accelerator.device, dtype=torch.long)
|
||||||
|
t5_attn_mask = t5_attn_mask.to(accelerator.device)
|
||||||
|
|
||||||
|
# Create padding mask
|
||||||
|
bs = latents.shape[0]
|
||||||
|
h_latent = latents.shape[-2]
|
||||||
|
w_latent = latents.shape[-1]
|
||||||
|
padding_mask = torch.zeros(
|
||||||
|
bs, 1, h_latent, w_latent,
|
||||||
|
dtype=weight_dtype, device=accelerator.device
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prepare block swap
|
||||||
|
if self.is_swapping_blocks:
|
||||||
|
accelerator.unwrap_model(unet).prepare_block_swap_before_forward()
|
||||||
|
|
||||||
|
# Call model (LLM adapter runs inside forward for DDP gradient sync)
|
||||||
|
with torch.set_grad_enabled(is_train), accelerator.autocast():
|
||||||
|
model_pred = unet(
|
||||||
|
noisy_model_input,
|
||||||
|
timesteps,
|
||||||
|
prompt_embeds,
|
||||||
|
padding_mask=padding_mask,
|
||||||
|
source_attention_mask=attn_mask,
|
||||||
|
t5_input_ids=t5_input_ids,
|
||||||
|
t5_attn_mask=t5_attn_mask,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Rectified flow target: noise - latents
|
||||||
|
target = noise - latents
|
||||||
|
|
||||||
|
# Loss weighting
|
||||||
|
weighting = anima_train_utils.compute_loss_weighting_for_anima(
|
||||||
|
weighting_scheme=args.weighting_scheme, sigmas=sigmas
|
||||||
|
)
|
||||||
|
|
||||||
|
# Differential output preservation
|
||||||
|
if "custom_attributes" in batch:
|
||||||
|
diff_output_pr_indices = []
|
||||||
|
for i, custom_attributes in enumerate(batch["custom_attributes"]):
|
||||||
|
if "diff_output_preservation" in custom_attributes and custom_attributes["diff_output_preservation"]:
|
||||||
|
diff_output_pr_indices.append(i)
|
||||||
|
|
||||||
|
if len(diff_output_pr_indices) > 0:
|
||||||
|
network.set_multiplier(0.0)
|
||||||
|
with torch.no_grad(), accelerator.autocast():
|
||||||
|
if self.is_swapping_blocks:
|
||||||
|
accelerator.unwrap_model(unet).prepare_block_swap_before_forward()
|
||||||
|
model_pred_prior = unet(
|
||||||
|
noisy_model_input[diff_output_pr_indices],
|
||||||
|
timesteps[diff_output_pr_indices],
|
||||||
|
prompt_embeds[diff_output_pr_indices],
|
||||||
|
padding_mask=padding_mask[diff_output_pr_indices],
|
||||||
|
source_attention_mask=attn_mask[diff_output_pr_indices],
|
||||||
|
t5_input_ids=t5_input_ids[diff_output_pr_indices],
|
||||||
|
t5_attn_mask=t5_attn_mask[diff_output_pr_indices],
|
||||||
|
)
|
||||||
|
network.set_multiplier(1.0)
|
||||||
|
|
||||||
|
target[diff_output_pr_indices] = model_pred_prior.to(target.dtype)
|
||||||
|
|
||||||
|
return model_pred, target, timesteps, weighting
|
||||||
|
|
||||||
|
def process_batch(
    self, batch, text_encoders, unet, network, vae, noise_scheduler,
    vae_dtype, weight_dtype, accelerator, args,
    text_encoding_strategy, tokenize_strategy,
    is_train=True, train_text_encoder=True, train_unet=True,
) -> torch.Tensor:
    """Override base process_batch for 5D video latents (B, C, T, H, W).

    Base class assumes 4D (B, C, H, W) for loss.mean([1,2,3]) and weighting broadcast.
    This override reduces over all non-batch dims (works for both 4D and 5D) and
    applies the flow-matching weighting after the per-sample reduction.

    Returns:
        Scalar training loss (mean over the batch).
    """
    # Local imports mirror the base-class implementation; kept local to avoid
    # import-time cost when this module is loaded for other entry points.
    import typing
    from library.custom_train_functions import apply_masked_loss

    with torch.no_grad():
        # Prefer cached latents from the dataset; otherwise encode images with the VAE.
        if "latents" in batch and batch["latents"] is not None:
            latents = typing.cast(torch.FloatTensor, batch["latents"].to(accelerator.device))
        else:
            if args.vae_batch_size is None or len(batch["images"]) <= args.vae_batch_size:
                latents = self.encode_images_to_latents(args, vae, batch["images"].to(accelerator.device, dtype=vae_dtype))
            else:
                # Encode in chunks of vae_batch_size to bound VAE peak memory.
                chunks = [
                    batch["images"][i : i + args.vae_batch_size] for i in range(0, len(batch["images"]), args.vae_batch_size)
                ]
                list_latents = []
                for chunk in chunks:
                    with torch.no_grad():
                        chunk = self.encode_images_to_latents(args, vae, chunk.to(accelerator.device, dtype=vae_dtype))
                    list_latents.append(chunk)
                latents = torch.cat(list_latents, dim=0)

        # Defensive: NaNs in latents (e.g. from fp16 VAE overflow) would poison the loss.
        if torch.any(torch.isnan(latents)):
            accelerator.print("NaN found in latents, replacing with zeros")
            latents = typing.cast(torch.FloatTensor, torch.nan_to_num(latents, 0, out=latents))

        latents = self.shift_scale_latents(args, latents)

    # Text encoder conditions: use cached outputs when available; re-encode when
    # training the text encoder or when no cache exists for this batch.
    text_encoder_conds = []
    text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None)
    if text_encoder_outputs_list is not None:
        text_encoder_conds = text_encoder_outputs_list

    if len(text_encoder_conds) == 0 or text_encoder_conds[0] is None or train_text_encoder:
        with torch.set_grad_enabled(is_train and train_text_encoder), accelerator.autocast():
            input_ids = [ids.to(accelerator.device) for ids in batch["input_ids_list"]]
            encoded_text_encoder_conds = text_encoding_strategy.encode_tokens(
                tokenize_strategy,
                self.get_models_for_text_encoding(args, accelerator, text_encoders),
                input_ids,
            )
            if args.full_fp16:
                encoded_text_encoder_conds = [c.to(weight_dtype) for c in encoded_text_encoder_conds]

        if len(text_encoder_conds) == 0:
            text_encoder_conds = encoded_text_encoder_conds
        else:
            # Merge: freshly encoded entries overwrite cached ones; None entries
            # (caption dropout / partially cached) keep the cached value.
            for i in range(len(encoded_text_encoder_conds)):
                if encoded_text_encoder_conds[i] is not None:
                    text_encoder_conds[i] = encoded_text_encoder_conds[i]

    noise_pred, target, timesteps, weighting = self.get_noise_pred_and_target(
        args, accelerator, noise_scheduler, latents, batch,
        text_encoder_conds, unet, network, weight_dtype, train_unet, is_train=is_train,
    )

    huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler)
    loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c)

    if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None):
        loss = apply_masked_loss(loss, batch)

    # Reduce all non-batch dims: (B, C, T, H, W) -> (B,) for 5D, (B, C, H, W) -> (B,) for 4D
    reduce_dims = list(range(1, loss.ndim))
    loss = loss.mean(reduce_dims)

    # Apply weighting after reducing to (B,)
    # NOTE(review): this assumes `weighting` is shaped (B,) (or broadcasts to it);
    # confirm compute_loss_weighting_for_anima does not return (B, 1, 1, 1).
    if weighting is not None:
        loss = loss * weighting

    # Per-sample dataset weights (e.g. repeats/balancing) from the collator.
    loss_weights = batch["loss_weights"]
    loss = loss * loss_weights

    loss = self.post_process_loss(loss, args, timesteps, noise_scheduler)
    return loss.mean()
|
||||||
|
|
||||||
|
def post_process_loss(self, loss, args, timesteps, noise_scheduler):
    """No-op hook: Anima applies no extra loss post-processing.

    Overrides the base trainer hook so that schemes like min-SNR weighting
    (which assume a noise-prediction objective) are not applied to the
    rectified-flow loss. Returns ``loss`` unchanged.
    """
    return loss
|
||||||
|
|
||||||
|
def get_sai_model_spec(self, args):
    # Build SAI-style metadata for saved checkpoints by delegating to the
    # shared helper with fixed architecture flags (no state_dict is passed).
    # NOTE(review): the positional booleans select the architecture variant in
    # train_util.get_sai_model_spec — confirm they map to the Anima entry there.
    return train_util.get_sai_model_spec(None, args, False, True, False, is_stable_diffusion_ckpt=True)
|
||||||
|
|
||||||
|
def update_metadata(self, metadata, args):
    """Record Anima flow-matching hyperparameters in the saved training metadata."""
    anima_entries = {
        "ss_weighting_scheme": args.weighting_scheme,
        "ss_discrete_flow_shift": args.discrete_flow_shift,
        # getattr with defaults: these options may be absent on older arg sets.
        "ss_timestep_sample_method": getattr(args, "timestep_sample_method", "logit_normal"),
        "ss_sigmoid_scale": getattr(args, "sigmoid_scale", 1.0),
    }
    metadata.update(anima_entries)
|
||||||
|
|
||||||
|
def is_text_encoder_not_needed_for_training(self, args):
    """Text encoders can be dropped from memory once their outputs are cached."""
    return args.cache_text_encoder_outputs
|
||||||
|
|
||||||
|
def prepare_unet_with_accelerator(
    self, args: argparse.Namespace, accelerator: Accelerator, unet: torch.nn.Module
) -> torch.nn.Module:
    """Wrap the DiT with Accelerate, handling unsloth checkpointing and block swap.

    Returns the accelerator-prepared model; when block swapping is enabled the
    model is deliberately NOT moved wholesale to the device.
    """
    # The base NetworkTrainer only calls enable_gradient_checkpointing(cpu_offload=True/False),
    # so we re-apply with unsloth_offload if needed (after base has already enabled it).
    if self._use_unsloth_offload_checkpointing and args.gradient_checkpointing:
        unet.enable_gradient_checkpointing(unsloth_offload=True)

    # Without block swapping the base class handles preparation entirely.
    if not self.is_swapping_blocks:
        return super().prepare_unet_with_accelerator(args, accelerator, unet)

    dit = unet
    # device_placement=[False]: keep swap-managed blocks off-device; we place
    # everything else manually below.
    dit = accelerator.prepare(dit, device_placement=[not self.is_swapping_blocks])
    accelerator.unwrap_model(dit).move_to_device_except_swap_blocks(accelerator.device)
    accelerator.unwrap_model(dit).prepare_block_swap_before_forward()

    return dit
|
||||||
|
|
||||||
|
def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype, is_train=True):
    """Apply caption dropout to cached text-encoder outputs before each step.

    When the batch carries precomputed text-encoder outputs, the encoding
    strategy decides (per sample) whether to drop them for caption dropout,
    and the batch entry is replaced with the result.
    """
    cached_outputs = batch.get("text_encoder_outputs_list", None)
    if cached_outputs is None:
        return
    encoding_strategy: strategy_anima.AnimaTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy()
    batch["text_encoder_outputs_list"] = encoding_strategy.drop_cached_text_encoder_outputs(*cached_outputs)
|
||||||
|
|
||||||
|
def on_validation_step_end(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype):
    """Re-arm block swapping after a validation step so training can resume."""
    if not self.is_swapping_blocks:
        return
    accelerator.unwrap_model(unet).prepare_block_swap_before_forward()
|
||||||
|
|
||||||
|
|
||||||
|
def setup_parser() -> argparse.ArgumentParser:
    """Build the CLI parser: base network-training args plus DiT and Anima options."""
    parser = train_network.setup_parser()
    train_util.add_dit_training_arguments(parser)
    anima_train_utils.add_anima_training_arguments(parser)
    parser.add_argument(
        "--unsloth_offload_checkpointing",
        action="store_true",
        help="offload activations to CPU RAM using async non-blocking transfers (faster than --cpu_offload_checkpointing). "
        "Cannot be used with --cpu_offload_checkpointing or --blocks_to_swap.",
    )
    return parser
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    parser = setup_parser()

    args = parser.parse_args()
    # Validate CLI-only constraints before merging values from a TOML config file.
    train_util.verify_command_line_training_args(args)
    args = train_util.read_config_from_file(args, parser)

    trainer = AnimaNetworkTrainer()
    trainer.train(args)
|
||||||
30
configs/qwen3_06b/config.json
Normal file
30
configs/qwen3_06b/config.json
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"Qwen3ForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_bias": false,
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"bos_token_id": 151643,
|
||||||
|
"eos_token_id": 151643,
|
||||||
|
"head_dim": 128,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 1024,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 3072,
|
||||||
|
"max_position_embeddings": 32768,
|
||||||
|
"max_window_layers": 28,
|
||||||
|
"model_type": "qwen3",
|
||||||
|
"num_attention_heads": 16,
|
||||||
|
"num_hidden_layers": 28,
|
||||||
|
"num_key_value_heads": 8,
|
||||||
|
"rms_norm_eps": 1e-06,
|
||||||
|
"rope_scaling": null,
|
||||||
|
"rope_theta": 1000000,
|
||||||
|
"sliding_window": null,
|
||||||
|
"tie_word_embeddings": true,
|
||||||
|
"torch_dtype": "bfloat16",
|
||||||
|
"transformers_version": "4.51.0",
|
||||||
|
"use_cache": true,
|
||||||
|
"use_sliding_window": false,
|
||||||
|
"vocab_size": 151936
|
||||||
|
}
|
||||||
151388
configs/qwen3_06b/merges.txt
Normal file
151388
configs/qwen3_06b/merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
303282
configs/qwen3_06b/tokenizer.json
Normal file
303282
configs/qwen3_06b/tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
239
configs/qwen3_06b/tokenizer_config.json
Normal file
239
configs/qwen3_06b/tokenizer_config.json
Normal file
@@ -0,0 +1,239 @@
|
|||||||
|
{
|
||||||
|
"add_bos_token": false,
|
||||||
|
"add_prefix_space": false,
|
||||||
|
"added_tokens_decoder": {
|
||||||
|
"151643": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151644": {
|
||||||
|
"content": "<|im_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151645": {
|
||||||
|
"content": "<|im_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151646": {
|
||||||
|
"content": "<|object_ref_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151647": {
|
||||||
|
"content": "<|object_ref_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151648": {
|
||||||
|
"content": "<|box_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151649": {
|
||||||
|
"content": "<|box_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151650": {
|
||||||
|
"content": "<|quad_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151651": {
|
||||||
|
"content": "<|quad_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151652": {
|
||||||
|
"content": "<|vision_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151653": {
|
||||||
|
"content": "<|vision_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151654": {
|
||||||
|
"content": "<|vision_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151655": {
|
||||||
|
"content": "<|image_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151656": {
|
||||||
|
"content": "<|video_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151657": {
|
||||||
|
"content": "<tool_call>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151658": {
|
||||||
|
"content": "</tool_call>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151659": {
|
||||||
|
"content": "<|fim_prefix|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151660": {
|
||||||
|
"content": "<|fim_middle|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151661": {
|
||||||
|
"content": "<|fim_suffix|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151662": {
|
||||||
|
"content": "<|fim_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151663": {
|
||||||
|
"content": "<|repo_name|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151664": {
|
||||||
|
"content": "<|file_sep|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151665": {
|
||||||
|
"content": "<tool_response>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151666": {
|
||||||
|
"content": "</tool_response>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151667": {
|
||||||
|
"content": "<think>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151668": {
|
||||||
|
"content": "</think>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<|im_start|>",
|
||||||
|
"<|im_end|>",
|
||||||
|
"<|object_ref_start|>",
|
||||||
|
"<|object_ref_end|>",
|
||||||
|
"<|box_start|>",
|
||||||
|
"<|box_end|>",
|
||||||
|
"<|quad_start|>",
|
||||||
|
"<|quad_end|>",
|
||||||
|
"<|vision_start|>",
|
||||||
|
"<|vision_end|>",
|
||||||
|
"<|vision_pad|>",
|
||||||
|
"<|image_pad|>",
|
||||||
|
"<|video_pad|>"
|
||||||
|
],
|
||||||
|
"bos_token": null,
|
||||||
|
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = 
message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
|
||||||
|
"clean_up_tokenization_spaces": false,
|
||||||
|
"eos_token": "<|endoftext|>",
|
||||||
|
"errors": "replace",
|
||||||
|
"model_max_length": 131072,
|
||||||
|
"pad_token": "<|endoftext|>",
|
||||||
|
"split_special_tokens": false,
|
||||||
|
"tokenizer_class": "Qwen2Tokenizer",
|
||||||
|
"unk_token": null
|
||||||
|
}
|
||||||
1
configs/qwen3_06b/vocab.json
Normal file
1
configs/qwen3_06b/vocab.json
Normal file
File diff suppressed because one or more lines are too long
51
configs/t5_old/config.json
Normal file
51
configs/t5_old/config.json
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"T5WithLMHeadModel"
|
||||||
|
],
|
||||||
|
"d_ff": 65536,
|
||||||
|
"d_kv": 128,
|
||||||
|
"d_model": 1024,
|
||||||
|
"decoder_start_token_id": 0,
|
||||||
|
"dropout_rate": 0.1,
|
||||||
|
"eos_token_id": 1,
|
||||||
|
"initializer_factor": 1.0,
|
||||||
|
"is_encoder_decoder": true,
|
||||||
|
"layer_norm_epsilon": 1e-06,
|
||||||
|
"model_type": "t5",
|
||||||
|
"n_positions": 512,
|
||||||
|
"num_heads": 128,
|
||||||
|
"num_layers": 24,
|
||||||
|
"output_past": true,
|
||||||
|
"pad_token_id": 0,
|
||||||
|
"relative_attention_num_buckets": 32,
|
||||||
|
"task_specific_params": {
|
||||||
|
"summarization": {
|
||||||
|
"early_stopping": true,
|
||||||
|
"length_penalty": 2.0,
|
||||||
|
"max_length": 200,
|
||||||
|
"min_length": 30,
|
||||||
|
"no_repeat_ngram_size": 3,
|
||||||
|
"num_beams": 4,
|
||||||
|
"prefix": "summarize: "
|
||||||
|
},
|
||||||
|
"translation_en_to_de": {
|
||||||
|
"early_stopping": true,
|
||||||
|
"max_length": 300,
|
||||||
|
"num_beams": 4,
|
||||||
|
"prefix": "translate English to German: "
|
||||||
|
},
|
||||||
|
"translation_en_to_fr": {
|
||||||
|
"early_stopping": true,
|
||||||
|
"max_length": 300,
|
||||||
|
"num_beams": 4,
|
||||||
|
"prefix": "translate English to French: "
|
||||||
|
},
|
||||||
|
"translation_en_to_ro": {
|
||||||
|
"early_stopping": true,
|
||||||
|
"max_length": 300,
|
||||||
|
"num_beams": 4,
|
||||||
|
"prefix": "translate English to Romanian: "
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"vocab_size": 32128
|
||||||
|
}
|
||||||
BIN
configs/t5_old/spiece.model
Normal file
BIN
configs/t5_old/spiece.model
Normal file
Binary file not shown.
1
configs/t5_old/tokenizer.json
Normal file
1
configs/t5_old/tokenizer.json
Normal file
File diff suppressed because one or more lines are too long
556
docs/anima_train_network.md
Normal file
556
docs/anima_train_network.md
Normal file
@@ -0,0 +1,556 @@
|
|||||||
|
# LoRA Training Guide for Anima using `anima_train_network.py` / `anima_train_network.py` を用いたAnima モデルのLoRA学習ガイド
|
||||||
|
|
||||||
|
This document explains how to train LoRA (Low-Rank Adaptation) models for Anima using `anima_train_network.py` in the `sd-scripts` repository.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>日本語</summary>
|
||||||
|
|
||||||
|
このドキュメントでは、`sd-scripts`リポジトリに含まれる`anima_train_network.py`を使用して、Anima モデルに対するLoRA (Low-Rank Adaptation) モデルを学習する基本的な手順について解説します。
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
## 1. Introduction / はじめに
|
||||||
|
|
||||||
|
`anima_train_network.py` trains additional networks such as LoRA for Anima models. Anima adopts a DiT (Diffusion Transformer) architecture based on the MiniTrainDIT design with Rectified Flow training. It uses a Qwen3-0.6B text encoder, an LLM Adapter (6-layer transformer bridge from Qwen3 to T5-compatible space), and a WanVAE (16-channel, 8x spatial downscale).
|
||||||
|
|
||||||
|
This guide assumes you already understand the basics of LoRA training. For common usage and options, see the [train_network.py guide](train_network.md). Some parameters are similar to those in [`sd3_train_network.py`](sd3_train_network.md) and [`flux_train_network.py`](flux_train_network.md).
|
||||||
|
|
||||||
|
**Prerequisites:**
|
||||||
|
|
||||||
|
* The `sd-scripts` repository has been cloned and the Python environment is ready.
|
||||||
|
* A training dataset has been prepared. See the [Dataset Configuration Guide](./config_README-en.md).
|
||||||
|
* Anima model files for training are available.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>日本語</summary>
|
||||||
|
|
||||||
|
`anima_train_network.py`は、Anima モデルに対してLoRAなどの追加ネットワークを学習させるためのスクリプトです。AnimaはMiniTrainDIT設計に基づくDiT (Diffusion Transformer) アーキテクチャを採用しており、Rectified Flow学習を使用します。テキストエンコーダーとしてQwen3-0.6B、LLM Adapter (Qwen3からT5互換空間への6層Transformerブリッジ)、およびWanVAE (16チャンネル、8倍空間ダウンスケール) を使用します。
|
||||||
|
|
||||||
|
このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象としています。基本的な使い方や共通のオプションについては、[`train_network.py`のガイド](train_network.md)を参照してください。また一部のパラメータは [`sd3_train_network.py`](sd3_train_network.md) や [`flux_train_network.py`](flux_train_network.md) と同様のものがあるため、そちらも参考にしてください。
|
||||||
|
|
||||||
|
**前提条件:**
|
||||||
|
|
||||||
|
* `sd-scripts`リポジトリのクローンとPython環境のセットアップが完了していること。
|
||||||
|
* 学習用データセットの準備が完了していること。(データセットの準備については[データセット設定ガイド](./config_README-en.md)を参照してください)
|
||||||
|
* 学習対象のAnimaモデルファイルが準備できていること。
|
||||||
|
</details>
|
||||||
|
|
||||||
|
## 2. Differences from `train_network.py` / `train_network.py` との違い
|
||||||
|
|
||||||
|
`anima_train_network.py` is based on `train_network.py` but modified for Anima. Main differences are:
|
||||||
|
|
||||||
|
* **Target models:** Anima DiT models.
|
||||||
|
* **Model structure:** Uses a MiniTrainDIT (Transformer based) instead of U-Net. Employs a single text encoder (Qwen3-0.6B), an LLM Adapter that bridges Qwen3 embeddings to T5-compatible cross-attention space, and a WanVAE (16-channel latent space with 8x spatial downscale).
|
||||||
|
* **Arguments:** Options exist to specify the Anima DiT model, Qwen3 text encoder, WanVAE, LLM adapter, and T5 tokenizer separately.
|
||||||
|
* **Incompatible arguments:** Stable Diffusion v1/v2 options such as `--v2`, `--v_parameterization` and `--clip_skip` are not used.
|
||||||
|
* **Anima-specific options:** Additional parameters for component-wise learning rates (self_attn, cross_attn, mlp, mod, llm_adapter), timestep sampling, discrete flow shift, and flash attention.
|
||||||
|
* **6 Parameter Groups:** Independent learning rates for `base`, `self_attn`, `cross_attn`, `mlp`, `adaln_modulation`, and `llm_adapter` components.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>日本語</summary>
|
||||||
|
|
||||||
|
`anima_train_network.py`は`train_network.py`をベースに、Anima モデルに対応するための変更が加えられています。主な違いは以下の通りです。
|
||||||
|
|
||||||
|
* **対象モデル:** Anima DiTモデルを対象とします。
|
||||||
|
* **モデル構造:** U-Netの代わりにMiniTrainDIT (Transformerベース) を使用します。テキストエンコーダーとしてQwen3-0.6B、Qwen3埋め込みをT5互換のクロスアテンション空間に変換するLLM Adapter、およびWanVAE (16チャンネル潜在空間、8倍空間ダウンスケール) を使用します。
|
||||||
|
* **引数:** Anima DiTモデル、Qwen3テキストエンコーダー、WanVAE、LLM Adapter、T5トークナイザーを個別に指定する引数があります。
|
||||||
|
* **一部引数の非互換性:** Stable Diffusion v1/v2向けの引数(例: `--v2`, `--v_parameterization`, `--clip_skip`)はAnimaの学習では使用されません。
|
||||||
|
* **Anima特有の引数:** コンポーネント別学習率(self_attn, cross_attn, mlp, mod, llm_adapter)、タイムステップサンプリング、離散フローシフト、Flash Attentionに関する引数が追加されています。
|
||||||
|
* **6パラメータグループ:** `base`、`self_attn`、`cross_attn`、`mlp`、`adaln_modulation`、`llm_adapter`の各コンポーネントに対して独立した学習率を設定できます。
|
||||||
|
</details>
|
||||||
|
|
||||||
|
## 3. Preparation / 準備
|
||||||
|
|
||||||
|
The following files are required before starting training:
|
||||||
|
|
||||||
|
1. **Training script:** `anima_train_network.py`
|
||||||
|
2. **Anima DiT model file:** `.safetensors` file for the base DiT model.
|
||||||
|
3. **Qwen3-0.6B text encoder:** Either a HuggingFace model directory or a single `.safetensors` file (requires `configs/qwen3_06b/` config files).
|
||||||
|
4. **WanVAE model file:** `.safetensors` or `.pth` file for the VAE.
|
||||||
|
5. **LLM Adapter model file (optional):** `.safetensors` file. If not provided separately, the adapter is loaded from the DiT file if the key `llm_adapter.out_proj.weight` exists.
|
||||||
|
6. **T5 Tokenizer (optional):** If not specified, uses the bundled tokenizer at `configs/t5_old/`.
|
||||||
|
7. **Dataset definition file (.toml):** Dataset settings in TOML format. (See the [Dataset Configuration Guide](./config_README-en.md).) In this document we use `my_anima_dataset_config.toml` as an example.
|
||||||
|
|
||||||
|
**Notes:**
|
||||||
|
* When using a single `.safetensors` file for Qwen3, download the `config.json`, `tokenizer.json`, `tokenizer_config.json`, and `vocab.json` from the [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B) HuggingFace repository into the `configs/qwen3_06b/` directory.
|
||||||
|
* The T5 tokenizer only needs the tokenizer files (not the T5 model weights). It uses the vocabulary from `google/t5-v1_1-xxl`.
|
||||||
|
* Models are saved with a `net.` prefix on all keys for ComfyUI compatibility.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>日本語</summary>
|
||||||
|
|
||||||
|
学習を開始する前に、以下のファイルが必要です。
|
||||||
|
|
||||||
|
1. **学習スクリプト:** `anima_train_network.py`
|
||||||
|
2. **Anima DiTモデルファイル:** ベースとなるDiTモデルの`.safetensors`ファイル。
|
||||||
|
3. **Qwen3-0.6Bテキストエンコーダー:** HuggingFaceモデルディレクトリまたは単体の`.safetensors`ファイル(`configs/qwen3_06b/`の設定ファイルが必要)。
|
||||||
|
4. **WanVAEモデルファイル:** VAEの`.safetensors`または`.pth`ファイル。
|
||||||
|
5. **LLM Adapterモデルファイル(オプション):** `.safetensors`ファイル。個別に指定しない場合、DiTファイル内に`llm_adapter.out_proj.weight`キーが存在すればそこから読み込まれます。
|
||||||
|
6. **T5トークナイザー(オプション):** 指定しない場合、`configs/t5_old/`のバンドル版トークナイザーを使用します。
|
||||||
|
7. **データセット定義ファイル (.toml):** 学習データセットの設定を記述したTOML形式のファイル。(詳細は[データセット設定ガイド](./config_README-en.md)を参照してください)。例として`my_anima_dataset_config.toml`を使用します。
|
||||||
|
|
||||||
|
**注意:**
|
||||||
|
* Qwen3の単体`.safetensors`ファイルを使用する場合、[Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B) HuggingFaceリポジトリから`config.json`、`tokenizer.json`、`tokenizer_config.json`、`vocab.json`をダウンロードし、`configs/qwen3_06b/`ディレクトリに配置してください。
|
||||||
|
* T5トークナイザーはトークナイザーファイルのみ必要です(T5モデルの重みは不要)。`google/t5-v1_1-xxl`の語彙を使用します。
|
||||||
|
* モデルはComfyUI互換のため、すべてのキーに`net.`プレフィックスを付けて保存されます。
|
||||||
|
</details>
|
||||||
|
|
||||||
|
## 4. Running the Training / 学習の実行
|
||||||
|
|
||||||
|
Execute `anima_train_network.py` from the terminal to start training. The overall command-line format is the same as `train_network.py`, but Anima-specific options must be supplied.
|
||||||
|
|
||||||
|
Example command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
accelerate launch --num_cpu_threads_per_process 1 anima_train_network.py \
|
||||||
|
--dit_path="<path to Anima DiT model>" \
|
||||||
|
--qwen3_path="<path to Qwen3-0.6B model or directory>" \
|
||||||
|
--vae_path="<path to WanVAE model>" \
|
||||||
|
--llm_adapter_path="<path to LLM adapter model>" \
|
||||||
|
--dataset_config="my_anima_dataset_config.toml" \
|
||||||
|
--output_dir="<output directory>" \
|
||||||
|
--output_name="my_anima_lora" \
|
||||||
|
--save_model_as=safetensors \
|
||||||
|
--network_module=networks.lora_anima \
|
||||||
|
--network_dim=8 \
|
||||||
|
--network_alpha=8 \
|
||||||
|
--learning_rate=1e-4 \
|
||||||
|
--optimizer_type="AdamW8bit" \
|
||||||
|
--lr_scheduler="constant" \
|
||||||
|
--timestep_sample_method="logit_normal" \
|
||||||
|
--discrete_flow_shift=3.0 \
|
||||||
|
--max_train_epochs=10 \
|
||||||
|
--save_every_n_epochs=1 \
|
||||||
|
--mixed_precision="bf16" \
|
||||||
|
--gradient_checkpointing \
|
||||||
|
--cache_latents \
|
||||||
|
--cache_text_encoder_outputs \
|
||||||
|
--blocks_to_swap=18
|
||||||
|
```
|
||||||
|
|
||||||
|
*(Write the command on one line or use `\` or `^` for line breaks.)*
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>日本語</summary>
|
||||||
|
|
||||||
|
学習は、ターミナルから`anima_train_network.py`を実行することで開始します。基本的なコマンドラインの構造は`train_network.py`と同様ですが、Anima特有の引数を指定する必要があります。
|
||||||
|
|
||||||
|
コマンドラインの例は英語のドキュメントを参照してください。
|
||||||
|
|
||||||
|
※実際には1行で書くか、適切な改行文字(`\` または `^`)を使用してください。
|
||||||
|
</details>
|
||||||
|
|
||||||
|
### 4.1. Explanation of Key Options / 主要なコマンドライン引数の解説
|
||||||
|
|
||||||
|
Besides the arguments explained in the [train_network.py guide](train_network.md), specify the following Anima specific options. For shared options (`--output_dir`, `--output_name`, `--network_module`, etc.), see that guide.
|
||||||
|
|
||||||
|
#### Model Options [Required] / モデル関連 [必須]
|
||||||
|
|
||||||
|
* `--dit_path="<path to Anima DiT model>"` **[Required]**
|
||||||
|
- Path to the Anima DiT model `.safetensors` file. The model config (channels, blocks, heads) is auto-detected from the state dict. ComfyUI format with `net.` prefix is supported.
|
||||||
|
* `--qwen3_path="<path to Qwen3-0.6B model>"` **[Required]**
|
||||||
|
- Path to the Qwen3-0.6B text encoder. Can be a HuggingFace model directory or a single `.safetensors` file. The text encoder is always frozen during training.
|
||||||
|
* `--vae_path="<path to WanVAE model>"` **[Required]**
|
||||||
|
- Path to the WanVAE model `.safetensors` or `.pth` file. Fixed config: `dim=96, z_dim=16`.
|
||||||
|
* `--llm_adapter_path="<path to LLM adapter>"` *[Optional]*
|
||||||
|
- Path to a separate LLM adapter weights file. If omitted, the adapter is loaded from the DiT file when the key `llm_adapter.out_proj.weight` exists.
|
||||||
|
* `--t5_tokenizer_path="<path to T5 tokenizer>"` *[Optional]*
|
||||||
|
- Path to the T5 tokenizer directory. If omitted, uses the bundled config at `configs/t5_old/`.
|
||||||
|
|
||||||
|
#### Anima Training Parameters / Anima 学習パラメータ
|
||||||
|
|
||||||
|
* `--timestep_sample_method=<choice>`
|
||||||
|
- Timestep sampling method. Choose from `logit_normal` (default) or `uniform`.
|
||||||
|
* `--discrete_flow_shift=<float>`
|
||||||
|
- Shift for the timestep distribution in Rectified Flow training. Default `3.0`. The shift formula is `t_shifted = (t * shift) / (1 + (shift - 1) * t)`.
|
||||||
|
* `--sigmoid_scale=<float>`
|
||||||
|
- Scale factor for `logit_normal` timestep sampling. Default `1.0`.
|
||||||
|
* `--qwen3_max_token_length=<integer>`
|
||||||
|
- Maximum token length for the Qwen3 tokenizer. Default `512`.
|
||||||
|
* `--t5_max_token_length=<integer>`
|
||||||
|
- Maximum token length for the T5 tokenizer. Default `512`.
|
||||||
|
* `--flash_attn`
|
||||||
|
- Use Flash Attention for DiT self/cross-attention. Requires `pip install flash-attn`. Falls back to PyTorch SDPA if the package is not installed. Note: Flash Attention is only applied to DiT blocks; the LLM Adapter uses standard attention because it requires attention masks.
|
||||||
|
* `--transformer_dtype=<choice>`
|
||||||
|
- Separate dtype for transformer blocks. Choose from `float16`, `bfloat16`, `float32`. If not specified, uses the same dtype as `--mixed_precision`.
|
||||||
|
|
||||||
|
#### Component-wise Learning Rates / コンポーネント別学習率
|
||||||
|
|
||||||
|
Anima supports 6 independent learning rate groups. Set to `0` to freeze a component:
|
||||||
|
|
||||||
|
* `--self_attn_lr=<float>` - Learning rate for self-attention layers. Default: same as `--learning_rate`.
|
||||||
|
* `--cross_attn_lr=<float>` - Learning rate for cross-attention layers. Default: same as `--learning_rate`.
|
||||||
|
* `--mlp_lr=<float>` - Learning rate for MLP layers. Default: same as `--learning_rate`.
|
||||||
|
* `--mod_lr=<float>` - Learning rate for AdaLN modulation layers. Default: same as `--learning_rate`.
|
||||||
|
* `--llm_adapter_lr=<float>` - Learning rate for LLM adapter layers. Default: same as `--learning_rate`.
|
||||||
|
|
||||||
|
#### Memory and Speed / メモリ・速度関連
|
||||||
|
|
||||||
|
* `--blocks_to_swap=<integer>` **[Experimental]**
|
||||||
|
- Number of Transformer blocks to swap between CPU and GPU. More blocks reduce VRAM but slow training. Maximum values depend on model size:
|
||||||
|
- 28-block model: max **26**
|
||||||
|
- 36-block model: max **34**
|
||||||
|
- 20-block model: max **18**
|
||||||
|
- Cannot be used with `--cpu_offload_checkpointing` or `--unsloth_offload_checkpointing`.
|
||||||
|
* `--unsloth_offload_checkpointing`
|
||||||
|
- Offload activations to CPU RAM using async non-blocking transfers. Faster than `--cpu_offload_checkpointing`. Cannot be combined with `--cpu_offload_checkpointing` or `--blocks_to_swap`.
|
||||||
|
* `--cache_text_encoder_outputs`
|
||||||
|
- Cache Qwen3 text encoder outputs to reduce VRAM usage. Recommended when not training text encoder LoRA.
|
||||||
|
* `--cache_text_encoder_outputs_to_disk`
|
||||||
|
- Cache text encoder outputs to disk. Auto-enables `--cache_text_encoder_outputs`.
|
||||||
|
* `--cache_latents`, `--cache_latents_to_disk`
|
||||||
|
- Cache WanVAE latent outputs.
|
||||||
|
* `--fp8_base`
|
||||||
|
- Use FP8 precision for the base model to reduce VRAM usage.
|
||||||
|
|
||||||
|
#### Incompatible or Deprecated Options / 非互換・非推奨の引数
|
||||||
|
|
||||||
|
* `--v2`, `--v_parameterization`, `--clip_skip` - Options for Stable Diffusion v1/v2 that are not used for Anima training.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>日本語</summary>
|
||||||
|
|
||||||
|
[`train_network.py`のガイド](train_network.md)で説明されている引数に加え、以下のAnima特有の引数を指定します。共通の引数については、上記ガイドを参照してください。
|
||||||
|
|
||||||
|
#### モデル関連 [必須]
|
||||||
|
|
||||||
|
* `--dit_path="<path to Anima DiT model>"` **[必須]** - Anima DiTモデルの`.safetensors`ファイルのパスを指定します。
|
||||||
|
* `--qwen3_path="<path to Qwen3-0.6B model>"` **[必須]** - Qwen3-0.6Bテキストエンコーダーのパスを指定します。
|
||||||
|
* `--vae_path="<path to WanVAE model>"` **[必須]** - WanVAEモデルのパスを指定します。
|
||||||
|
* `--llm_adapter_path="<path to LLM adapter>"` *[オプション]* - 個別のLLM Adapterの重みファイルのパス。
|
||||||
|
* `--t5_tokenizer_path="<path to T5 tokenizer>"` *[オプション]* - T5トークナイザーディレクトリのパス。
|
||||||
|
|
||||||
|
#### Anima 学習パラメータ
|
||||||
|
|
||||||
|
* `--timestep_sample_method` - タイムステップのサンプリング方法。`logit_normal`(デフォルト)または`uniform`。
|
||||||
|
* `--discrete_flow_shift` - Rectified Flow学習のタイムステップ分布シフト。デフォルト`3.0`。
|
||||||
|
* `--sigmoid_scale` - logit_normalタイムステップサンプリングのスケール係数。デフォルト`1.0`。
|
||||||
|
* `--qwen3_max_token_length` - Qwen3トークナイザーの最大トークン長。デフォルト`512`。
|
||||||
|
* `--t5_max_token_length` - T5トークナイザーの最大トークン長。デフォルト`512`。
|
||||||
|
* `--flash_attn` - DiTのself/cross-attentionにFlash Attentionを使用。`pip install flash-attn`が必要。
|
||||||
|
* `--transformer_dtype` - Transformerブロック用の個別dtype。
|
||||||
|
|
||||||
|
#### コンポーネント別学習率
|
||||||
|
|
||||||
|
Animaは6つの独立した学習率グループをサポートします。`0`に設定するとそのコンポーネントをフリーズします:
|
||||||
|
|
||||||
|
* `--self_attn_lr` - Self-attention層の学習率。
|
||||||
|
* `--cross_attn_lr` - Cross-attention層の学習率。
|
||||||
|
* `--mlp_lr` - MLP層の学習率。
|
||||||
|
* `--mod_lr` - AdaLNモジュレーション層の学習率。
|
||||||
|
* `--llm_adapter_lr` - LLM Adapter層の学習率。
|
||||||
|
|
||||||
|
#### メモリ・速度関連
|
||||||
|
|
||||||
|
* `--blocks_to_swap` **[実験的機能]** - TransformerブロックをCPUとGPUでスワップしてVRAMを節約。
|
||||||
|
* `--unsloth_offload_checkpointing` - 非同期転送でアクティベーションをCPU RAMにオフロード。
|
||||||
|
* `--cache_text_encoder_outputs` - Qwen3の出力をキャッシュしてメモリ使用量を削減。
|
||||||
|
* `--cache_latents`, `--cache_latents_to_disk` - WanVAEの出力をキャッシュ。
|
||||||
|
* `--fp8_base` - ベースモデルにFP8精度を使用。
|
||||||
|
</details>
|
||||||
|
|
||||||
|
### 4.2. Starting Training / 学習の開始
|
||||||
|
|
||||||
|
After setting the required arguments, run the command to begin training. The overall flow and how to check logs are the same as in the [train_network.py guide](train_network.md#32-starting-the-training--学習の開始).
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>日本語</summary>
|
||||||
|
|
||||||
|
必要な引数を設定したら、コマンドを実行して学習を開始します。全体の流れやログの確認方法は、[train_network.pyのガイド](train_network.md#32-starting-the-training--学習の開始)と同様です。
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
## 5. LoRA Target Modules / LoRAの学習対象モジュール
|
||||||
|
|
||||||
|
When training LoRA with `anima_train_network.py`, the following modules are targeted:
|
||||||
|
|
||||||
|
* **DiT Blocks (`Block`)**: Self-attention, cross-attention, MLP, and AdaLN modulation layers within each transformer block.
|
||||||
|
* **LLM Adapter Blocks (`LLMAdapterTransformerBlock`)**: Only when `--network_args "train_llm_adapter=True"` is specified.
|
||||||
|
* **Text Encoder (Qwen3)**: Only when `--network_train_unet_only` is NOT specified.
|
||||||
|
|
||||||
|
The LoRA network module is `networks.lora_anima`.
|
||||||
|
|
||||||
|
### 5.1. Layer-specific Rank Configuration / 各層に対するランク指定
|
||||||
|
|
||||||
|
You can specify different ranks (network_dim) for each component of the Anima model. Setting `0` disables LoRA for that component.
|
||||||
|
|
||||||
|
| network_args | Target Component |
|
||||||
|
|---|---|
|
||||||
|
| `self_attn_dim` | Self-attention layers in DiT blocks |
|
||||||
|
| `cross_attn_dim` | Cross-attention layers in DiT blocks |
|
||||||
|
| `mlp_dim` | MLP layers in DiT blocks |
|
||||||
|
| `mod_dim` | AdaLN modulation layers in DiT blocks |
|
||||||
|
| `llm_adapter_dim` | LLM adapter layers (requires `train_llm_adapter=True`) |
|
||||||
|
|
||||||
|
Example usage:
|
||||||
|
```
|
||||||
|
--network_args "self_attn_dim=8" "cross_attn_dim=4" "mlp_dim=8" "mod_dim=4"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5.2. Embedding Layer LoRA / 埋め込み層LoRA
|
||||||
|
|
||||||
|
You can apply LoRA to embedding/output layers by specifying `emb_dims` in network_args as a comma-separated list of 3 numbers:
|
||||||
|
|
||||||
|
```
|
||||||
|
--network_args "emb_dims=[8,4,8]"
|
||||||
|
```
|
||||||
|
|
||||||
|
Each number corresponds to:
|
||||||
|
1. `x_embedder` (patch embedding)
|
||||||
|
2. `t_embedder` (timestep embedding)
|
||||||
|
3. `final_layer` (output layer)
|
||||||
|
|
||||||
|
Setting `0` disables LoRA for that layer.
|
||||||
|
|
||||||
|
### 5.3. Block Selection for Training / 学習するブロックの指定
|
||||||
|
|
||||||
|
You can specify which DiT blocks to train using `train_block_indices` in network_args. The indices are 0-based. Default is to train all blocks.
|
||||||
|
|
||||||
|
Specify indices as comma-separated integers or ranges:
|
||||||
|
|
||||||
|
```
|
||||||
|
--network_args "train_block_indices=0-5,10,15-27"
|
||||||
|
```
|
||||||
|
|
||||||
|
Special values: `all` (train all blocks), `none` (skip all blocks).
|
||||||
|
|
||||||
|
### 5.4. LLM Adapter LoRA / LLM Adapter LoRA
|
||||||
|
|
||||||
|
To apply LoRA to the LLM Adapter blocks:
|
||||||
|
|
||||||
|
```
|
||||||
|
--network_args "train_llm_adapter=True" "llm_adapter_dim=4"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5.5. Other Network Args / その他のネットワーク引数
|
||||||
|
|
||||||
|
* `--network_args "verbose=True"` - Print all LoRA module names and their dimensions.
|
||||||
|
* `--network_args "rank_dropout=0.1"` - Rank dropout rate.
|
||||||
|
* `--network_args "module_dropout=0.1"` - Module dropout rate.
|
||||||
|
* `--network_args "loraplus_lr_ratio=2.0"` - LoRA+ learning rate ratio.
|
||||||
|
* `--network_args "loraplus_unet_lr_ratio=2.0"` - LoRA+ learning rate ratio for DiT only.
|
||||||
|
* `--network_args "loraplus_text_encoder_lr_ratio=2.0"` - LoRA+ learning rate ratio for text encoder only.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>日本語</summary>
|
||||||
|
|
||||||
|
`anima_train_network.py`でLoRAを学習させる場合、デフォルトでは以下のモジュールが対象となります。
|
||||||
|
|
||||||
|
* **DiTブロック (`Block`)**: 各Transformerブロック内のSelf-attention、Cross-attention、MLP、AdaLNモジュレーション層。
|
||||||
|
* **LLM Adapterブロック (`LLMAdapterTransformerBlock`)**: `--network_args "train_llm_adapter=True"`を指定した場合のみ。
|
||||||
|
* **テキストエンコーダー (Qwen3)**: `--network_train_unet_only`を指定しない場合のみ。
|
||||||
|
|
||||||
|
### 5.1. 各層のランクを指定する
|
||||||
|
|
||||||
|
`--network_args`で各コンポーネントに異なるランクを指定できます。`0`を指定するとその層にはLoRAが適用されません。
|
||||||
|
|
||||||
|
|network_args|対象コンポーネント|
|
||||||
|
|---|---|
|
||||||
|
|`self_attn_dim`|DiTブロック内のSelf-attention層|
|
||||||
|
|`cross_attn_dim`|DiTブロック内のCross-attention層|
|
||||||
|
|`mlp_dim`|DiTブロック内のMLP層|
|
||||||
|
|`mod_dim`|DiTブロック内のAdaLNモジュレーション層|
|
||||||
|
|`llm_adapter_dim`|LLM Adapter層(`train_llm_adapter=True`が必要)|
|
||||||
|
|
||||||
|
### 5.2. 埋め込み層LoRA
|
||||||
|
|
||||||
|
`emb_dims`で埋め込み/出力層にLoRAを適用できます。3つの数値をカンマ区切りで指定します。
|
||||||
|
|
||||||
|
各数値は `x_embedder`(パッチ埋め込み)、`t_embedder`(タイムステップ埋め込み)、`final_layer`(出力層)に対応します。
|
||||||
|
|
||||||
|
### 5.3. 学習するブロックの指定
|
||||||
|
|
||||||
|
`train_block_indices`でLoRAを適用するDiTブロックを指定できます。
|
||||||
|
|
||||||
|
### 5.4. LLM Adapter LoRA
|
||||||
|
|
||||||
|
LLM AdapterブロックにLoRAを適用するには:`--network_args "train_llm_adapter=True" "llm_adapter_dim=4"`
|
||||||
|
|
||||||
|
### 5.5. その他のネットワーク引数
|
||||||
|
|
||||||
|
* `verbose=True` - 全LoRAモジュール名とdimを表示
|
||||||
|
* `rank_dropout` - ランクドロップアウト率
|
||||||
|
* `module_dropout` - モジュールドロップアウト率
|
||||||
|
* `loraplus_lr_ratio` - LoRA+学習率比率
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
## 6. Using the Trained Model / 学習済みモデルの利用
|
||||||
|
|
||||||
|
When training finishes, a LoRA model file (e.g. `my_anima_lora.safetensors`) is saved in the directory specified by `output_dir`. Use this file with inference environments that support Anima, such as ComfyUI with appropriate nodes.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>日本語</summary>
|
||||||
|
|
||||||
|
学習が完了すると、指定した`output_dir`にLoRAモデルファイル(例: `my_anima_lora.safetensors`)が保存されます。このファイルは、Anima モデルに対応した推論環境(例: ComfyUI + 適切なノード)で使用できます。
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
## 7. Advanced Settings / 高度な設定
|
||||||
|
|
||||||
|
### 7.1. VRAM Usage Optimization / VRAM使用量の最適化
|
||||||
|
|
||||||
|
Anima models can be large, so GPUs with limited VRAM may require optimization:
|
||||||
|
|
||||||
|
#### Key VRAM Reduction Options
|
||||||
|
|
||||||
|
- **`--fp8_base`**: Enables training in FP8 format for the DiT model.
|
||||||
|
|
||||||
|
- **`--blocks_to_swap <number>`**: Swaps blocks between CPU and GPU to reduce VRAM usage. Higher numbers save more VRAM but reduce training speed. See model-specific max values in section 4.1.
|
||||||
|
|
||||||
|
- **`--unsloth_offload_checkpointing`**: Offloads gradient checkpoints to CPU using async non-blocking transfers. Faster than `--cpu_offload_checkpointing`. Cannot be combined with `--blocks_to_swap`.
|
||||||
|
|
||||||
|
- **`--gradient_checkpointing`**: Standard gradient checkpointing to reduce VRAM at the cost of compute.
|
||||||
|
|
||||||
|
- **`--cache_text_encoder_outputs`**: Caches Qwen3 outputs so the text encoder can be freed from VRAM during training.
|
||||||
|
|
||||||
|
- **`--cache_latents`**: Caches WanVAE outputs so the VAE can be freed from VRAM during training.
|
||||||
|
|
||||||
|
- **Using Adafactor optimizer**: Can reduce VRAM usage:
|
||||||
|
```
|
||||||
|
--optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --lr_scheduler constant_with_warmup --max_grad_norm 0.0
|
||||||
|
```
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>日本語</summary>
|
||||||
|
|
||||||
|
Animaモデルは大きい場合があるため、VRAMが限られたGPUでは最適化が必要です。
|
||||||
|
|
||||||
|
主要なVRAM削減オプション:
|
||||||
|
- `--fp8_base`: FP8形式での学習を有効化
|
||||||
|
- `--blocks_to_swap`: CPUとGPU間でブロックをスワップ
|
||||||
|
- `--unsloth_offload_checkpointing`: 非同期転送でアクティベーションをCPUにオフロード
|
||||||
|
- `--gradient_checkpointing`: 標準的な勾配チェックポイント
|
||||||
|
- `--cache_text_encoder_outputs`: Qwen3の出力をキャッシュ
|
||||||
|
- `--cache_latents`: WanVAEの出力をキャッシュ
|
||||||
|
- Adafactorオプティマイザの使用
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
### 7.2. Training Settings / 学習設定
|
||||||
|
|
||||||
|
#### Timestep Sampling
|
||||||
|
|
||||||
|
The `--timestep_sample_method` option specifies how timesteps (0-1) are sampled:
|
||||||
|
|
||||||
|
- `logit_normal` (default): Sample from Normal(0,1), multiply by `sigmoid_scale`, apply sigmoid. Good general-purpose option.
|
||||||
|
- `uniform`: Uniform random sampling from [0, 1].
|
||||||
|
|
||||||
|
#### Discrete Flow Shift
|
||||||
|
|
||||||
|
The `--discrete_flow_shift` option (default `3.0`) shifts the timestep distribution toward higher noise levels. The formula is:
|
||||||
|
|
||||||
|
```
|
||||||
|
t_shifted = (t * shift) / (1 + (shift - 1) * t)
|
||||||
|
```
|
||||||
|
|
||||||
|
Timesteps are clamped to `[1e-5, 1-1e-5]` after shifting.
|
||||||
|
|
||||||
|
#### Loss Weighting
|
||||||
|
|
||||||
|
The `--weighting_scheme` option specifies loss weighting by timestep:
|
||||||
|
|
||||||
|
- `uniform` (default): Equal weight for all timesteps.
|
||||||
|
- `sigma_sqrt`: Weight by `sigma^(-2)`.
|
||||||
|
- `cosmap`: Weight by `2 / (pi * (1 - 2*sigma + 2*sigma^2))`.
|
||||||
|
- `none`: Same as uniform.
|
||||||
|
|
||||||
|
#### Caption Dropout
|
||||||
|
|
||||||
|
Use `--caption_dropout_rate` for embedding-level caption dropout. This is handled by `AnimaTextEncodingStrategy` and is compatible with text encoder output caching. The subset-level `caption_dropout_rate` is automatically zeroed when this is set.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>日本語</summary>
|
||||||
|
|
||||||
|
#### タイムステップサンプリング
|
||||||
|
|
||||||
|
`--timestep_sample_method`でタイムステップのサンプリング方法を指定します:
|
||||||
|
- `logit_normal`(デフォルト): 正規分布からサンプリングし、sigmoidを適用。
|
||||||
|
- `uniform`: [0, 1]の一様分布からサンプリング。
|
||||||
|
|
||||||
|
#### 離散フローシフト
|
||||||
|
|
||||||
|
`--discrete_flow_shift`(デフォルト`3.0`)はタイムステップ分布を高ノイズ側にシフトします。
|
||||||
|
|
||||||
|
#### 損失の重み付け
|
||||||
|
|
||||||
|
`--weighting_scheme`でタイムステップごとの損失の重み付けを指定します。
|
||||||
|
|
||||||
|
#### キャプションドロップアウト
|
||||||
|
|
||||||
|
`--caption_dropout_rate`で埋め込みレベルのキャプションドロップアウトを使用します。テキストエンコーダー出力のキャッシュと互換性があります。
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
### 7.3. Text Encoder LoRA Support / Text Encoder LoRAのサポート
|
||||||
|
|
||||||
|
Anima LoRA training supports training Qwen3 text encoder LoRA:
|
||||||
|
|
||||||
|
- To train only DiT: specify `--network_train_unet_only`
|
||||||
|
- To train DiT and Qwen3: omit `--network_train_unet_only`
|
||||||
|
|
||||||
|
You can specify a separate learning rate for Qwen3 with `--text_encoder_lr`. If not specified, the default `--learning_rate` is used.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>日本語</summary>
|
||||||
|
|
||||||
|
Anima LoRA学習では、Qwen3テキストエンコーダーのLoRAもトレーニングできます。
|
||||||
|
|
||||||
|
- DiTのみ学習: `--network_train_unet_only`を指定
|
||||||
|
- DiTとQwen3を学習: `--network_train_unet_only`を省略
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
## 8. Other Training Options / その他の学習オプション
|
||||||
|
|
||||||
|
- **`--loss_type`**: Loss function for training. Default `l2`.
|
||||||
|
- `l1`: L1 loss.
|
||||||
|
- `l2`: L2 loss (mean squared error).
|
||||||
|
- `huber`: Huber loss.
|
||||||
|
- `smooth_l1`: Smooth L1 loss.
|
||||||
|
|
||||||
|
- **`--huber_schedule`**, **`--huber_c`**, **`--huber_scale`**: Parameters for Huber loss when `--loss_type` is `huber` or `smooth_l1`.
|
||||||
|
|
||||||
|
- **`--ip_noise_gamma`**, **`--ip_noise_gamma_random_strength`**: Input Perturbation noise gamma values.
|
||||||
|
|
||||||
|
- **`--fused_backward_pass`**: Fuses the backward pass and optimizer step to reduce VRAM usage. Only works with Adafactor. For details, see the [`sdxl_train_network.py` guide](sdxl_train_network.md).
|
||||||
|
|
||||||
|
- **`--weighting_scheme`**, **`--logit_mean`**, **`--logit_std`**, **`--mode_scale`**: Timestep loss weighting options. For details, refer to the [`sd3_train_network.md` guide](sd3_train_network.md).
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>日本語</summary>
|
||||||
|
|
||||||
|
- **`--loss_type`**: 学習に用いる損失関数。デフォルト`l2`。`l1`, `l2`, `huber`, `smooth_l1`から選択。
|
||||||
|
- **`--huber_schedule`**, **`--huber_c`**, **`--huber_scale`**: Huber損失のパラメータ。
|
||||||
|
- **`--ip_noise_gamma`**: Input Perturbationノイズガンマ値。
|
||||||
|
- **`--fused_backward_pass`**: バックワードパスとオプティマイザステップの融合。
|
||||||
|
- **`--weighting_scheme`** 等: タイムステップ損失の重み付け。詳細は[`sd3_train_network.md`](sd3_train_network.md)を参照。
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
## 9. Others / その他
|
||||||
|
|
||||||
|
### Metadata Saved in LoRA Models
|
||||||
|
|
||||||
|
The following Anima-specific metadata is saved in the LoRA model file:
|
||||||
|
|
||||||
|
* `ss_weighting_scheme`
|
||||||
|
* `ss_discrete_flow_shift`
|
||||||
|
* `ss_timestep_sample_method`
|
||||||
|
* `ss_sigmoid_scale`
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>日本語</summary>
|
||||||
|
|
||||||
|
`anima_train_network.py`には、サンプル画像の生成 (`--sample_prompts`など) や詳細なオプティマイザ設定など、`train_network.py`と共通の機能も多く存在します。これらについては、[`train_network.py`のガイド](train_network.md#5-other-features--その他の機能)やスクリプトのヘルプ (`python anima_train_network.py --help`) を参照してください。
|
||||||
|
|
||||||
|
### LoRAモデルに保存されるメタデータ
|
||||||
|
|
||||||
|
以下のAnima固有のメタデータがLoRAモデルファイルに保存されます:
|
||||||
|
|
||||||
|
* `ss_weighting_scheme`
|
||||||
|
* `ss_discrete_flow_shift`
|
||||||
|
* `ss_timestep_sample_method`
|
||||||
|
* `ss_sigmoid_scale`
|
||||||
|
|
||||||
|
</details>
|
||||||
1630
library/anima_models.py
Normal file
1630
library/anima_models.py
Normal file
File diff suppressed because it is too large
Load Diff
665
library/anima_train_utils.py
Normal file
665
library/anima_train_utils.py
Normal file
@@ -0,0 +1,665 @@
|
|||||||
|
# Anima Training Utilities
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from typing import Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from safetensors.torch import save_file
|
||||||
|
from accelerate import Accelerator, PartialState
|
||||||
|
from tqdm import tqdm
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from library.device_utils import init_ipex, clean_memory_on_device
|
||||||
|
|
||||||
|
init_ipex()
|
||||||
|
|
||||||
|
from .utils import setup_logging
|
||||||
|
|
||||||
|
setup_logging()
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
from library import anima_models, anima_utils, strategy_base, train_util
|
||||||
|
|
||||||
|
from library.sd3_train_utils import FlowMatchEulerDiscreteScheduler, get_sigmas
|
||||||
|
|
||||||
|
|
||||||
|
# Anima-specific training arguments
|
||||||
|
|
||||||
|
def add_anima_training_arguments(parser: argparse.ArgumentParser):
    """Register every Anima-specific command-line option on *parser*."""
    add = parser.add_argument

    # --- model component paths ---
    add("--dit_path", type=str, default=None,
        help="Path to Anima DiT model safetensors file")
    add("--vae_path", type=str, default=None,
        help="Path to WanVAE safetensors/pth file")
    add("--qwen3_path", type=str, default=None,
        help="Path to Qwen3-0.6B model (safetensors file or directory)")
    add("--llm_adapter_path", type=str, default=None,
        help="Path to separate LLM adapter weights. If None, adapter is loaded from DiT file if present")

    # --- per-group learning rates (None = inherit base LR, 0 = freeze the group) ---
    add("--llm_adapter_lr", type=float, default=None,
        help="Learning rate for LLM adapter. None=same as base LR, 0=freeze adapter")
    add("--self_attn_lr", type=float, default=None,
        help="Learning rate for self-attention layers. None=same as base LR, 0=freeze")
    add("--cross_attn_lr", type=float, default=None,
        help="Learning rate for cross-attention layers. None=same as base LR, 0=freeze")
    add("--mlp_lr", type=float, default=None,
        help="Learning rate for MLP layers. None=same as base LR, 0=freeze")
    add("--mod_lr", type=float, default=None,
        help="Learning rate for AdaLN modulation layers. None=same as base LR, 0=freeze")

    # --- tokenizers ---
    add("--t5_tokenizer_path", type=str, default=None,
        help="Path to T5 tokenizer directory. If None, uses default configs/t5_old/")
    add("--qwen3_max_token_length", type=int, default=512,
        help="Maximum token length for Qwen3 tokenizer (default: 512)")
    add("--t5_max_token_length", type=int, default=512,
        help="Maximum token length for T5 tokenizer (default: 512)")

    # --- rectified-flow timestep sampling ---
    add("--discrete_flow_shift", type=float, default=1.0,
        help="Timestep distribution shift for rectified flow training (default: 1.0)")
    add("--timestep_sample_method", type=str, default="logit_normal",
        choices=["logit_normal", "uniform"],
        help="Timestep sampling method (default: logit_normal)")
    add("--sigmoid_scale", type=float, default=1.0,
        help="Scale factor for logit_normal timestep sampling (default: 1.0)")

    # Note: --caption_dropout_rate is defined by base add_dataset_arguments().
    # Anima uses embedding-level dropout (via AnimaTextEncodingStrategy.dropout_rate)
    # instead of dataset-level caption dropout, so the subset caption_dropout_rate
    # is zeroed out in the training scripts to allow caching.

    # --- precision / attention backends ---
    add("--transformer_dtype", type=str, default=None,
        choices=["float16", "bfloat16", "float32", None],
        help="Separate dtype for transformer blocks. If None, uses same as mixed_precision")
    add("--flash_attn", action="store_true",
        help="Use Flash Attention for DiT self/cross-attention (requires flash-attn package). "
             "Falls back to PyTorch SDPA if flash-attn is not installed.")
|
||||||
|
|
||||||
|
|
||||||
|
# Noise & Timestep sampling (Rectified Flow)
|
||||||
|
def get_noisy_model_input_and_timesteps(
    args,
    latents: torch.Tensor,
    noise: torch.Tensor,
    device: torch.device,
    dtype: torch.dtype,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Build the rectified-flow training input and its timesteps.

    The forward process is ``noisy = (1 - t) * latents + t * noise`` and the
    training target elsewhere is ``noise - latents``.

    Args:
        args: Namespace providing timestep_sample_method, sigmoid_scale,
            discrete_flow_shift and (optionally) ip_noise_gamma settings.
        latents: Clean latent tensors.
        noise: Random noise tensors, same shape as latents.
        device: Target device.
        dtype: Target dtype.

    Returns:
        Tuple of (noisy_model_input, timesteps, sigmas).
    """
    batch_size = latents.shape[0]

    sample_method = getattr(args, 'timestep_sample_method', 'logit_normal')
    sig_scale = getattr(args, 'sigmoid_scale', 1.0)
    flow_shift = getattr(args, 'discrete_flow_shift', 1.0)

    if sample_method == 'logit_normal':
        sampler = torch.distributions.normal.Normal(0, 1)
    elif sample_method == 'uniform':
        sampler = torch.distributions.uniform.Uniform(0, 1)
    else:
        raise NotImplementedError(f"Unknown timestep_sample_method: {sample_method}")

    t = sampler.sample((batch_size,)).to(device)

    # logit-normal: scale the gaussian draw, then squash into (0, 1)
    if sample_method == 'logit_normal':
        t = torch.sigmoid(t * sig_scale)

    # Optional distribution shift toward higher-noise timesteps
    if flow_shift is not None and flow_shift != 1.0:
        t = (t * flow_shift) / (1 + (flow_shift - 1) * t)

    # Keep t strictly inside (0, 1)
    t = t.clamp(1e-5, 1.0 - 1e-5)

    # Broadcast t over all non-batch dimensions of the latents
    t_b = t.view(-1, *([1] * (latents.ndim - 1)))

    gamma = getattr(args, 'ip_noise_gamma', None)
    if gamma:
        # Input-perturbation noise: perturb the noise term before mixing
        perturbation = torch.randn_like(latents, device=latents.device, dtype=dtype)
        if getattr(args, 'ip_noise_gamma_random_strength', False):
            gamma = torch.rand(1, device=latents.device, dtype=dtype) * gamma
        noisy = (1 - t_b) * latents + t_b * (noise + gamma * perturbation)
    else:
        noisy = (1 - t_b) * latents + t_b * noise

    # Sigmas for potential loss weighting
    sigmas = t.view(-1, 1)

    return noisy.to(dtype), t.to(dtype), sigmas.to(dtype)
|
||||||
|
|
||||||
|
|
||||||
|
# Loss weighting
|
||||||
|
|
||||||
|
def compute_loss_weighting_for_anima(weighting_scheme: str, sigmas: torch.Tensor) -> torch.Tensor:
    """Return per-sample loss weights for the given weighting scheme.

    Mirrors the SD3 weighting schemes; "none", None, and any unrecognized
    scheme all fall back to uniform weights.
    """
    if weighting_scheme == "sigma_sqrt":
        return (sigmas**-2.0).float()
    if weighting_scheme == "cosmap":
        denominator = 1 - 2 * sigmas + 2 * sigmas**2
        return 2 / (math.pi * denominator)
    # uniform weighting for "none"/None and unknown schemes alike
    return torch.ones_like(sigmas)
|
||||||
|
|
||||||
|
|
||||||
|
# Parameter groups (6 groups with separate LRs)
|
||||||
|
def get_anima_param_groups(
    dit,
    base_lr: float,
    self_attn_lr: Optional[float] = None,
    cross_attn_lr: Optional[float] = None,
    mlp_lr: Optional[float] = None,
    mod_lr: Optional[float] = None,
    llm_adapter_lr: Optional[float] = None,
):
    """Bucket the DiT parameters into six optimizer groups with separate LRs.

    A group LR of None inherits ``base_lr``; an LR of exactly 0 freezes that
    group (requires_grad is cleared and the group is omitted from the result).

    Args:
        dit: MiniTrainDIT model whose named_parameters are routed by substring.
        base_lr: Base learning rate.
        self_attn_lr: LR for self-attention layers (None = base_lr, 0 = freeze).
        cross_attn_lr: LR for cross-attention layers.
        mlp_lr: LR for MLP layers.
        mod_lr: LR for AdaLN modulation layers.
        llm_adapter_lr: LR for the LLM adapter.

    Returns:
        List of ``{'params': ..., 'lr': ...}`` dicts for the optimizer.
    """
    log = logging.getLogger(__name__)

    # None means "inherit the base learning rate" for every specialized group.
    self_attn_lr = base_lr if self_attn_lr is None else self_attn_lr
    cross_attn_lr = base_lr if cross_attn_lr is None else cross_attn_lr
    mlp_lr = base_lr if mlp_lr is None else mlp_lr
    mod_lr = base_lr if mod_lr is None else mod_lr
    llm_adapter_lr = base_lr if llm_adapter_lr is None else llm_adapter_lr

    buckets = {key: [] for key in ("base", "self_attn", "cross_attn", "mlp", "mod", "llm_adapter")}

    for pname, param in dit.named_parameters():
        # Keep the full dotted name on the tensor for debugging.
        param.original_name = pname
        if 'llm_adapter' in pname:
            buckets["llm_adapter"].append(param)
        elif '.self_attn' in pname:
            buckets["self_attn"].append(param)
        elif '.cross_attn' in pname:
            buckets["cross_attn"].append(param)
        elif '.mlp' in pname:
            buckets["mlp"].append(param)
        elif '.adaln_modulation' in pname:
            buckets["mod"].append(param)
        else:
            buckets["base"].append(param)

    log.info(f"Parameter groups:")
    log.info(f"  base_params: {len(buckets['base'])} (lr={base_lr})")
    log.info(f"  self_attn_params: {len(buckets['self_attn'])} (lr={self_attn_lr})")
    log.info(f"  cross_attn_params: {len(buckets['cross_attn'])} (lr={cross_attn_lr})")
    log.info(f"  mlp_params: {len(buckets['mlp'])} (lr={mlp_lr})")
    log.info(f"  mod_params: {len(buckets['mod'])} (lr={mod_lr})")
    log.info(f"  llm_adapter_params: {len(buckets['llm_adapter'])} (lr={llm_adapter_lr})")

    schedule = [
        (base_lr, buckets["base"], "base"),
        (self_attn_lr, buckets["self_attn"], "self_attn"),
        (cross_attn_lr, buckets["cross_attn"], "cross_attn"),
        (mlp_lr, buckets["mlp"], "mlp"),
        (mod_lr, buckets["mod"], "mod"),
        (llm_adapter_lr, buckets["llm_adapter"], "llm_adapter"),
    ]

    param_groups = []
    for lr, params, name in schedule:
        if lr == 0:
            # Zero LR: freeze the whole group rather than wasting optimizer state.
            for param in params:
                param.requires_grad_(False)
            log.info(f"  Frozen {name} params ({len(params)} parameters)")
        elif len(params) > 0:
            param_groups.append({'params': params, 'lr': lr})

    total_trainable = sum(p.numel() for group in param_groups for p in group['params'] if p.requires_grad)
    log.info(f"Total trainable parameters: {total_trainable:,}")

    return param_groups
|
||||||
|
|
||||||
|
|
||||||
|
# Save functions
|
||||||
|
def save_anima_model_on_train_end(
    args: argparse.Namespace,
    save_dtype: torch.dtype,
    epoch: int,
    global_step: int,
    dit: anima_models.MiniTrainDIT,
):
    """Save the Anima DiT checkpoint at the end of training.

    File naming/rotation is delegated to
    ``train_util.save_sd_model_on_train_end_common``; the actual state-dict
    write goes through ``anima_utils.save_anima_model``.
    """
    def sd_saver(ckpt_file, epoch_no, global_step):
        # NOTE(review): sai_metadata is computed but never passed to the
        # saver below — confirm whether save_anima_model should embed it.
        sai_metadata = train_util.get_sai_model_spec(
            None, args, False, False, False, is_stable_diffusion_ckpt=True
        )
        dit_sd = dit.state_dict()
        # Save with 'net.' prefix for ComfyUI compatibility
        anima_utils.save_anima_model(ckpt_file, dit_sd, save_dtype)

    train_util.save_sd_model_on_train_end_common(args, True, True, epoch, global_step, sd_saver, None)
|
||||||
|
|
||||||
|
|
||||||
|
def save_anima_model_on_epoch_end_or_stepwise(
    args: argparse.Namespace,
    on_epoch_end: bool,
    accelerator: Accelerator,
    save_dtype: torch.dtype,
    epoch: int,
    num_train_epochs: int,
    global_step: int,
    dit: anima_models.MiniTrainDIT,
):
    """Save the Anima DiT checkpoint at an epoch boundary or save-step.

    Mirrors ``save_anima_model_on_train_end`` but delegates the decision of
    whether/when to save (and old-checkpoint removal) to
    ``train_util.save_sd_model_on_epoch_end_or_stepwise_common``.
    """
    def sd_saver(ckpt_file, epoch_no, global_step):
        # NOTE(review): sai_metadata is computed but never passed to the
        # saver below — confirm whether save_anima_model should embed it.
        sai_metadata = train_util.get_sai_model_spec(
            None, args, False, False, False, is_stable_diffusion_ckpt=True
        )
        dit_sd = dit.state_dict()
        anima_utils.save_anima_model(ckpt_file, dit_sd, save_dtype)

    train_util.save_sd_model_on_epoch_end_or_stepwise_common(
        args,
        on_epoch_end,
        accelerator,
        True,
        True,
        epoch,
        num_train_epochs,
        global_step,
        sd_saver,
        None,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Sampling (Euler discrete for rectified flow)
|
||||||
|
def do_sample(
    height: int,
    width: int,
    seed: Optional[int],
    dit: "anima_models.MiniTrainDIT",
    crossattn_emb: torch.Tensor,
    steps: int,
    dtype: torch.dtype,
    device: torch.device,
    guidance_scale: float = 1.0,
    neg_crossattn_emb: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Generate a latent sample via Euler discrete integration of the rectified flow.

    Args:
        height, width: Output image dimensions in pixels (latents are 8x smaller).
        seed: Random seed for the initial noise (None for nondeterministic).
        dit: MiniTrainDIT model; invoked as ``dit(x, t, emb, padding_mask=...)``.
        crossattn_emb: Cross-attention embeddings (B, N, D).
        steps: Number of Euler integration steps.
        dtype: Compute dtype.
        device: Compute device.
        guidance_scale: CFG scale; values > 1.0 enable classifier-free guidance.
        neg_crossattn_emb: Negative cross-attention embeddings, required for CFG.

    Returns:
        Denoised latents of shape (1, 16, 1, height // 8, width // 8).
    """
    # Optional progress bar: fall back to a plain iterator if tqdm is unavailable.
    try:
        from tqdm import tqdm as _progress
    except ImportError:
        def _progress(iterable, **_kwargs):
            return iterable

    # Latent shape: (1, 16, 1, H/8, W/8) for a single image (one temporal frame).
    # Build the shape directly instead of allocating a throwaway zeros tensor.
    latent_h = height // 8
    latent_w = width // 8
    latent_shape = (1, 16, 1, latent_h, latent_w)

    # Use a LOCAL generator so sampling does not clobber the global RNG state.
    # A fresh CPU Generator seeded with the same value produces the same
    # sequence as the previous torch.manual_seed(seed) + default generator, so
    # generated samples are unchanged for a given seed.
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)
    else:
        generator = None
    noise = torch.randn(
        latent_shape, dtype=torch.float32, generator=generator, device="cpu"
    ).to(dtype).to(device)

    # Sigma schedule: linear from 1.0 (pure noise) down to 0.0 (clean sample).
    sigmas = torch.linspace(1.0, 0.0, steps + 1, device=device, dtype=dtype)

    # Start from pure noise
    x = noise.clone()

    # Padding mask (zeros = no padding) — resized in prepare_embedded_sequence to match latent dims
    padding_mask = torch.zeros(1, 1, latent_h, latent_w, dtype=dtype, device=device)

    use_cfg = guidance_scale > 1.0 and neg_crossattn_emb is not None

    for i in _progress(range(steps), desc="Sampling"):
        sigma = sigmas[i]
        t = sigma.unsqueeze(0)  # (1,)

        dit.prepare_block_swap_before_forward()

        if use_cfg:
            # CFG: run positive and negative conditioning in one batched forward.
            x_input = torch.cat([x, x], dim=0)
            t_input = torch.cat([t, t], dim=0)
            crossattn_input = torch.cat([crossattn_emb, neg_crossattn_emb], dim=0)
            padding_input = torch.cat([padding_mask, padding_mask], dim=0)

            model_output = dit(x_input, t_input, crossattn_input, padding_mask=padding_input)
            model_output = model_output.float()

            pos_out, neg_out = model_output.chunk(2)
            model_output = neg_out + guidance_scale * (pos_out - neg_out)
        else:
            model_output = dit(x, t, crossattn_emb, padding_mask=padding_mask)
            model_output = model_output.float()

        # Euler step: dt is negative, moving sigma toward 0.
        dt = sigmas[i + 1] - sigma
        x = x + model_output * dt
        x = x.to(dtype)

    # Release swapped blocks after the final forward pass.
    dit.prepare_block_swap_before_forward()
    return x
|
||||||
|
|
||||||
|
|
||||||
|
def sample_images(
    accelerator: Accelerator,
    args: argparse.Namespace,
    epoch,
    steps,
    dit,
    vae,
    vae_scale,
    text_encoder,
    tokenize_strategy,
    text_encoding_strategy,
    sample_prompts_te_outputs=None,
    prompt_replacement=None,
):
    """Generate sample images during training.

    This is a simplified sampler for Anima - it generates images using the current model state.

    Gating: at steps == 0 only --sample_at_first triggers sampling. Afterwards,
    epoch-based sampling (sample_every_n_epochs) takes priority over step-based
    sampling; step-based sampling is skipped on epoch-end calls (epoch is not None).
    """
    if steps == 0:
        if not args.sample_at_first:
            return
    else:
        if args.sample_every_n_steps is None and args.sample_every_n_epochs is None:
            return
        if args.sample_every_n_epochs is not None:
            if epoch is None or epoch % args.sample_every_n_epochs != 0:
                return
        else:
            # epoch is not None means this is an epoch-end call; step-based mode skips it
            if steps % args.sample_every_n_steps != 0 or epoch is not None:
                return

    logger.info(f"Generating sample images at step {steps}")
    # Cached text-encoder outputs can substitute for the prompt file
    if not os.path.isfile(args.sample_prompts) and sample_prompts_te_outputs is None:
        logger.error(f"No prompt file: {args.sample_prompts}")
        return

    # Unwrap models from the accelerator wrappers before inference
    dit = accelerator.unwrap_model(dit)
    if text_encoder is not None:
        text_encoder = accelerator.unwrap_model(text_encoder)

    prompts = train_util.load_prompts(args.sample_prompts)
    save_dir = os.path.join(args.output_dir, "sample")
    os.makedirs(save_dir, exist_ok=True)

    # Save RNG state so per-prompt seeding does not perturb training randomness
    rng_state = torch.get_rng_state()
    cuda_rng_state = None
    try:
        cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None
    except Exception:
        pass

    with torch.no_grad(), accelerator.autocast():
        for prompt_dict in prompts:
            _sample_image_inference(
                accelerator, args, dit, text_encoder, vae, vae_scale,
                tokenize_strategy, text_encoding_strategy,
                save_dir, prompt_dict, epoch, steps,
                sample_prompts_te_outputs, prompt_replacement,
            )

    # Restore RNG state
    torch.set_rng_state(rng_state)
    if cuda_rng_state is not None:
        torch.cuda.set_rng_state(cuda_rng_state)

    clean_memory_on_device(accelerator.device)
|
||||||
|
|
||||||
|
|
||||||
|
def _sample_image_inference(
    accelerator, args, dit, text_encoder, vae, vae_scale,
    tokenize_strategy, text_encoding_strategy,
    save_dir, prompt_dict, epoch, steps,
    sample_prompts_te_outputs, prompt_replacement,
):
    """Generate and save a single sample image for one prompt dict.

    prompt_dict keys used: prompt, negative_prompt, sample_steps, width,
    height, scale, seed, enum. Cached text-encoder outputs
    (sample_prompts_te_outputs) are preferred over on-the-fly encoding.
    """
    prompt = prompt_dict.get("prompt", "")
    negative_prompt = prompt_dict.get("negative_prompt", "")
    sample_steps = prompt_dict.get("sample_steps", 30)
    width = prompt_dict.get("width", 512)
    height = prompt_dict.get("height", 512)
    scale = prompt_dict.get("scale", 7.5)
    seed = prompt_dict.get("seed")

    if prompt_replacement is not None:
        prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1])
        if negative_prompt:
            negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1])

    if seed is not None:
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # seed all CUDA devices for multi-GPU

    # Snap dimensions down to multiples of 16 with a minimum of 64 px
    height = max(64, height - height % 16)
    width = max(64, width - width % 16)

    logger.info(f"  prompt: {prompt}, size: {width}x{height}, steps: {sample_steps}, scale: {scale}")

    # Encode prompt: prefer cached TE outputs, fall back to live encoding
    def encode_prompt(prpt):
        if sample_prompts_te_outputs and prpt in sample_prompts_te_outputs:
            return sample_prompts_te_outputs[prpt]
        if text_encoder is not None:
            tokens = tokenize_strategy.tokenize(prpt)
            encoded = text_encoding_strategy.encode_tokens(tokenize_strategy, [text_encoder], tokens)
            return encoded
        return None

    encoded = encode_prompt(prompt)
    if encoded is None:
        logger.warning("Cannot encode prompt, skipping sample")
        return

    # Expected 4-tuple: Qwen3 embeddings/mask plus T5 ids/mask for the adapter
    prompt_embeds, attn_mask, t5_input_ids, t5_attn_mask = encoded

    # Convert to tensors if numpy (cached outputs are stored as numpy arrays)
    if isinstance(prompt_embeds, np.ndarray):
        prompt_embeds = torch.from_numpy(prompt_embeds).unsqueeze(0)
        attn_mask = torch.from_numpy(attn_mask).unsqueeze(0)
        t5_input_ids = torch.from_numpy(t5_input_ids).unsqueeze(0)
        t5_attn_mask = torch.from_numpy(t5_attn_mask).unsqueeze(0)

    # dit.t_embedding_norm.weight.dtype serves as the model's compute dtype
    prompt_embeds = prompt_embeds.to(accelerator.device, dtype=dit.t_embedding_norm.weight.dtype)
    attn_mask = attn_mask.to(accelerator.device)
    t5_input_ids = t5_input_ids.to(accelerator.device, dtype=torch.long)
    t5_attn_mask = t5_attn_mask.to(accelerator.device)

    # Process through LLM adapter if available
    if dit.use_llm_adapter and hasattr(dit, 'llm_adapter'):
        crossattn_emb = dit.llm_adapter(
            source_hidden_states=prompt_embeds,
            target_input_ids=t5_input_ids,
            target_attention_mask=t5_attn_mask,
            source_attention_mask=attn_mask,
        )
        # Zero out embeddings at padded T5 positions
        crossattn_emb[~t5_attn_mask.bool()] = 0
    else:
        crossattn_emb = prompt_embeds

    # Encode negative prompt for CFG (only needed when scale > 1.0)
    neg_crossattn_emb = None
    if scale > 1.0 and negative_prompt is not None:
        neg_encoded = encode_prompt(negative_prompt)
        if neg_encoded is not None:
            neg_pe, neg_am, neg_t5_ids, neg_t5_am = neg_encoded
            if isinstance(neg_pe, np.ndarray):
                neg_pe = torch.from_numpy(neg_pe).unsqueeze(0)
                neg_am = torch.from_numpy(neg_am).unsqueeze(0)
                neg_t5_ids = torch.from_numpy(neg_t5_ids).unsqueeze(0)
                neg_t5_am = torch.from_numpy(neg_t5_am).unsqueeze(0)

            neg_pe = neg_pe.to(accelerator.device, dtype=dit.t_embedding_norm.weight.dtype)
            neg_am = neg_am.to(accelerator.device)
            neg_t5_ids = neg_t5_ids.to(accelerator.device, dtype=torch.long)
            neg_t5_am = neg_t5_am.to(accelerator.device)

            if dit.use_llm_adapter and hasattr(dit, 'llm_adapter'):
                neg_crossattn_emb = dit.llm_adapter(
                    source_hidden_states=neg_pe,
                    target_input_ids=neg_t5_ids,
                    target_attention_mask=neg_t5_am,
                    source_attention_mask=neg_am,
                )
                neg_crossattn_emb[~neg_t5_am.bool()] = 0
            else:
                neg_crossattn_emb = neg_pe

    # Generate sample
    clean_memory_on_device(accelerator.device)
    latents = do_sample(
        height, width, seed, dit, crossattn_emb,
        sample_steps, dit.t_embedding_norm.weight.dtype,
        accelerator.device, scale, neg_crossattn_emb,
    )

    # Decode latents — move the VAE to the compute device only for the decode
    clean_memory_on_device(accelerator.device)
    org_vae_device = next(vae.parameters()).device
    vae.to(accelerator.device)
    decoded = vae.decode(latents.to(next(vae.parameters()).device, dtype=next(vae.parameters()).dtype), vae_scale)
    vae.to(org_vae_device)
    clean_memory_on_device(accelerator.device)

    # Convert from [-1, 1] float to an 8-bit HWC image
    image = decoded.float()
    image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)[0]
    # Remove temporal dim if present
    if image.ndim == 4:
        image = image[:, 0, :, :]
    decoded_np = 255.0 * np.moveaxis(image.cpu().numpy(), 0, 2)
    decoded_np = decoded_np.astype(np.uint8)

    image = Image.fromarray(decoded_np)

    # Filename: <output_name>_<epoch-or-step>_<enum>_<timestamp>[_<seed>].png
    ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
    num_suffix = f"e{epoch:06d}" if epoch is not None else f"{steps:06d}"
    seed_suffix = "" if seed is None else f"_{seed}"
    i = prompt_dict.get("enum", 0)
    img_filename = f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png"
    image.save(os.path.join(save_dir, img_filename))

    # Log to wandb if enabled
    if "wandb" in [tracker.name for tracker in accelerator.trackers]:
        wandb_tracker = accelerator.get_tracker("wandb")
        import wandb
        wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False)
|
||||||
325
library/anima_utils.py
Normal file
325
library/anima_utils.py
Normal file
@@ -0,0 +1,325 @@
|
|||||||
|
# Anima model loading/saving utilities
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import Dict, List, Optional, Union
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from safetensors.torch import load_file, save_file
|
||||||
|
from accelerate.utils import set_module_tensor_to_device # kept for potential future use
|
||||||
|
|
||||||
|
from .utils import setup_logging
|
||||||
|
|
||||||
|
setup_logging()
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
from library import anima_models
|
||||||
|
|
||||||
|
|
||||||
|
# Keys that should stay in high precision (float32/bfloat16, not quantized)
|
||||||
|
KEEP_IN_HIGH_PRECISION = ['x_embedder', 't_embedder', 't_embedding_norm', 'final_layer']
|
||||||
|
|
||||||
|
|
||||||
|
def load_safetensors(path: str, device: str = "cpu", dtype: Optional[torch.dtype] = None) -> Dict[str, torch.Tensor]:
    """Read a safetensors file into a dict, optionally casting every tensor to *dtype*."""
    tensors = load_file(path, device=device)
    if dtype is None:
        return tensors
    return {key: tensor.to(dtype) for key, tensor in tensors.items()}
|
||||||
|
|
||||||
|
|
||||||
|
def load_anima_dit(
    dit_path: str,
    dtype: torch.dtype,
    device: Union[str, torch.device] = "cpu",
    transformer_dtype: Optional[torch.dtype] = None,
    llm_adapter_path: Optional[str] = None,
    disable_mmap: bool = False,
) -> anima_models.MiniTrainDIT:
    """Load the MiniTrainDIT model from safetensors.

    Args:
        dit_path: Path to DiT safetensors file
        dtype: Base dtype for model parameters
        device: Device to load to
        transformer_dtype: Optional separate dtype for transformer blocks (lower precision)
        llm_adapter_path: Optional separate path for LLM adapter weights
        disable_mmap: If True, disable memory-mapped loading (reduces peak memory)

    Returns:
        The constructed MiniTrainDIT with the checkpoint loaded and per-parameter
        dtypes applied, moved to *device*.
    """
    if transformer_dtype is None:
        transformer_dtype = dtype

    logger.info(f"Loading Anima DiT from {dit_path}")
    if disable_mmap:
        from library.safetensors_utils import load_safetensors as load_safetensors_no_mmap
        state_dict = load_safetensors_no_mmap(dit_path, device="cpu", disable_mmap=True)
    else:
        state_dict = load_file(dit_path, device="cpu")

    # Remove 'net.' prefix if present (ComfyUI-style checkpoints)
    new_state_dict = {}
    for k, v in state_dict.items():
        if k.startswith('net.'):
            k = k[len('net.'):]
        new_state_dict[k] = v
    state_dict = new_state_dict

    # Derive config from state_dict
    dit_config = anima_models.get_dit_config(state_dict)

    # Detect LLM adapter: explicit path wins, else check for adapter keys in the DiT file
    if llm_adapter_path is not None:
        use_llm_adapter = True
        dit_config['use_llm_adapter'] = True
        llm_adapter_state_dict = load_safetensors(llm_adapter_path, device="cpu")
    elif 'llm_adapter.out_proj.weight' in state_dict:
        use_llm_adapter = True
        dit_config['use_llm_adapter'] = True
        llm_adapter_state_dict = None  # Loaded as part of DiT
    else:
        use_llm_adapter = False
        llm_adapter_state_dict = None

    logger.info(f"DiT config: model_channels={dit_config['model_channels']}, num_blocks={dit_config['num_blocks']}, "
                f"num_heads={dit_config['num_heads']}, use_llm_adapter={use_llm_adapter}")

    # Build model normally on CPU — buffers get proper values from __init__
    dit = anima_models.MiniTrainDIT(**dit_config)

    # Merge LLM adapter weights into state_dict if loaded separately
    if use_llm_adapter and llm_adapter_state_dict is not None:
        for k, v in llm_adapter_state_dict.items():
            state_dict[f"llm_adapter.{k}"] = v

    # Load checkpoint: strict=False keeps buffers not in checkpoint (e.g. pos_embedder.seq)
    missing, unexpected = dit.load_state_dict(state_dict, strict=False)
    if missing:
        # Filter out expected missing buffers (initialized in __init__, not saved in checkpoint)
        unexpected_missing = [k for k in missing if not any(
            buf_name in k for buf_name in ('seq', 'dim_spatial_range', 'dim_temporal_range', 'inv_freq')
        )]
        if unexpected_missing:
            logger.warning(f"Missing keys in checkpoint: {unexpected_missing[:10]}{'...' if len(unexpected_missing) > 10 else ''}")
    if unexpected:
        logger.info(f"Unexpected keys in checkpoint (ignored): {unexpected[:5]}{'...' if len(unexpected) > 5 else ''}")

    # Apply per-parameter dtype (high precision for 1D/critical, transformer_dtype for rest)
    for name, p in dit.named_parameters():
        dtype_to_use = dtype if (
            any(keyword in name for keyword in KEEP_IN_HIGH_PRECISION) or p.ndim == 1
        ) else transformer_dtype
        p.data = p.data.to(dtype=dtype_to_use)

    dit.to(device)
    logger.info(f"Loaded Anima DiT successfully. Parameters: {sum(p.numel() for p in dit.parameters()):,}")
    return dit
|
||||||
|
|
||||||
|
|
||||||
|
def load_anima_vae(vae_path: str, dtype: torch.dtype = torch.float32, device: str = "cpu"):
    """Load WanVAE from a safetensors/pth file.

    Args:
        vae_path: Path to the VAE weights (.safetensors or a torch .pth file).
        dtype: Parameter/normalization dtype.
        device: Device the VAE and normalization tensors are placed on.

    Returns (vae_model, mean_tensor, std_tensor, scale).
    """
    from library.anima_models import ANIMA_VAE_MEAN, ANIMA_VAE_STD

    logger.info(f"Loading Anima VAE from {vae_path}")

    # VAE config (fixed for WanVAE)
    vae_config = dict(
        dim=96,
        z_dim=16,
        dim_mult=[1, 2, 4, 4],
        num_res_blocks=2,
        attn_scales=[],
        # NOTE: 'temperal_downsample' spelling matches the upstream WanVAE_ kwarg
        temperal_downsample=[False, True, True],
        dropout=0.0,
    )

    from library.anima_vae import WanVAE_

    # Build model on the meta device so no memory is allocated before loading
    with torch.device('meta'):
        vae = WanVAE_(**vae_config)

    # Load state dict
    if vae_path.endswith('.safetensors'):
        vae_sd = load_file(vae_path, device='cpu')
    else:
        vae_sd = torch.load(vae_path, map_location='cpu', weights_only=True)

    # assign=True replaces the meta tensors with the loaded ones
    vae.load_state_dict(vae_sd, assign=True)
    vae = vae.eval().requires_grad_(False).to(device, dtype=dtype)

    # Per-channel latent normalization tensors; scale = [mean, 1/std]
    mean = torch.tensor(ANIMA_VAE_MEAN, dtype=dtype, device=device)
    std = torch.tensor(ANIMA_VAE_STD, dtype=dtype, device=device)
    scale = [mean, 1.0 / std]

    logger.info(f"Loaded Anima VAE successfully.")
    return vae, mean, std, scale
|
||||||
|
|
||||||
|
|
||||||
|
def load_qwen3_tokenizer(qwen3_path: str):
    """Load only the Qwen3 tokenizer (no text encoder weights).

    Args:
        qwen3_path: Directory containing model files, or a single weights file.
            For a directory the tokenizer is loaded from it directly; for a
            file the bundled configs/qwen3_06b/ directory supplies the
            tokenizer files.

    Returns:
        The Qwen3 tokenizer, with ``pad_token`` defaulted to ``eos_token``.
    """
    from transformers import AutoTokenizer

    if os.path.isdir(qwen3_path):
        source_dir = qwen3_path
    else:
        # Single-file checkpoint: fall back to the repo-bundled tokenizer config.
        source_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'configs', 'qwen3_06b')
        if not os.path.exists(source_dir):
            raise FileNotFoundError(
                f"Qwen3 config directory not found at {source_dir}. "
                "Expected configs/qwen3_06b/ with config.json, tokenizer.json, etc. "
                "You can download these from the Qwen3-0.6B HuggingFace repository."
            )

    tokenizer = AutoTokenizer.from_pretrained(source_dir, local_files_only=True)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return tokenizer
|
|
||||||
|
|
||||||
|
def load_qwen3_text_encoder(qwen3_path: str, dtype: torch.dtype = torch.bfloat16, device: str = "cpu"):
    """Load the Qwen3-0.6B text encoder (decoder backbone) and its tokenizer.

    Args:
        qwen3_path: Directory with full model files, or a single weights file
            (.safetensors / torch-serialized). In the file case the bundled
            configs/qwen3_06b/ directory supplies the architecture config.
        dtype: Model dtype.
        device: Device to load to.

    Returns:
        (text_encoder_model, tokenizer)
    """
    import transformers
    from transformers import AutoTokenizer

    logger.info(f"Loading Qwen3 text encoder from {qwen3_path}")

    if os.path.isdir(qwen3_path):
        # Directory layout: transformers loads tokenizer, config and weights directly.
        tokenizer = AutoTokenizer.from_pretrained(qwen3_path, local_files_only=True)
        model = transformers.AutoModelForCausalLM.from_pretrained(
            qwen3_path, torch_dtype=dtype, local_files_only=True
        ).model
    else:
        # Standalone weights file: build the architecture from the bundled
        # config, then fill in the weights manually.
        config_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'configs', 'qwen3_06b')
        if not os.path.exists(config_dir):
            raise FileNotFoundError(
                f"Qwen3 config directory not found at {config_dir}. "
                "Expected configs/qwen3_06b/ with config.json, tokenizer.json, etc. "
                "You can download these from the Qwen3-0.6B HuggingFace repository."
            )

        tokenizer = AutoTokenizer.from_pretrained(config_dir, local_files_only=True)
        qwen3_config = transformers.Qwen3Config.from_pretrained(config_dir, local_files_only=True)
        model = transformers.Qwen3ForCausalLM(qwen3_config).model

        # Read the raw state dict from disk.
        if qwen3_path.endswith('.safetensors'):
            raw_sd = load_file(qwen3_path, device='cpu')
        else:
            raw_sd = torch.load(qwen3_path, map_location='cpu', weights_only=True)

        # Full causal-LM checkpoints prefix keys with 'model.'; strip it so the
        # keys match the bare backbone we instantiated.
        stripped_sd = {}
        for key, value in raw_sd.items():
            stripped_sd[key[len('model.'):] if key.startswith('model.') else key] = value

        info = model.load_state_dict(stripped_sd, strict=False)
        logger.info(f"Loaded Qwen3 state dict: {info}")

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # The encoder is used only for feature extraction, so KV caching is disabled
    # and gradients are frozen.
    model.config.use_cache = False
    model = model.requires_grad_(False).to(device, dtype=dtype)

    logger.info(f"Loaded Qwen3 text encoder. Parameters: {sum(p.numel() for p in model.parameters()):,}")
    return model, tokenizer
|
|
||||||
|
|
||||||
|
def load_t5_tokenizer(t5_tokenizer_path: Optional[str] = None):
    """Load the T5 tokenizer used for the LLM Adapter's target token IDs.

    Args:
        t5_tokenizer_path: Optional T5 tokenizer directory. When omitted, the
            repo-bundled configs/t5_old/ files are used instead.
    """
    from transformers import T5TokenizerFast

    if t5_tokenizer_path is not None:
        return T5TokenizerFast.from_pretrained(t5_tokenizer_path, local_files_only=True)

    # Fall back to the bundled tokenizer files.
    config_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'configs', 't5_old')
    if not os.path.exists(config_dir):
        raise FileNotFoundError(
            f"T5 tokenizer config directory not found at {config_dir}. "
            "Expected configs/t5_old/ with spiece.model and tokenizer.json. "
            "You can download these from the google/t5-v1_1-xxl HuggingFace repository."
        )

    return T5TokenizerFast(
        vocab_file=os.path.join(config_dir, 'spiece.model'),
        tokenizer_file=os.path.join(config_dir, 'tokenizer.json'),
    )
|
|
||||||
|
|
||||||
|
def save_anima_model(save_path: str, dit_state_dict: Dict[str, torch.Tensor], dtype: Optional[torch.dtype] = None):
    """Save an Anima DiT state dict with the 'net.' key prefix for ComfyUI compatibility.

    Args:
        save_path: Output path (.safetensors).
        dit_state_dict: State dict from ``dit.state_dict()``.
        dtype: Optional dtype to cast each tensor to before saving.
    """
    # Optional cast applied uniformly to every tensor.
    cast = (lambda t: t.to(dtype)) if dtype is not None else (lambda t: t)
    # ComfyUI loaders expect every weight under the 'net.' namespace.
    prefixed_sd = {'net.' + key: cast(tensor).contiguous() for key, tensor in dit_state_dict.items()}

    save_file(prefixed_sd, save_path, metadata={'format': 'pt'})
    logger.info(f"Saved Anima model to {save_path}")
|
|
||||||
|
|
||||||
|
def vae_encode(tensor: torch.Tensor, vae, scale):
    """Encode a pixel tensor into normalized latents through WanVAE.

    Args:
        tensor: Input tensor (B, C, T, H, W) in [-1, 1] range.
        vae: WanVAE_ model.
        scale: [mean, 1/std] list used for latent normalization.

    Returns:
        Normalized latents.
    """
    latents = vae.encode(tensor, scale)
    return latents
|
|
||||||
|
|
||||||
|
def vae_decode(latents: torch.Tensor, vae, scale):
    """Decode normalized latents back to pixels through WanVAE.

    Args:
        latents: Normalized latents.
        vae: WanVAE_ model.
        scale: [mean, 1/std] list used for latent denormalization.

    Returns:
        Decoded tensor in [-1, 1] range.
    """
    decoded = vae.decode(latents, scale)
    return decoded
577
library/anima_vae.py
Normal file
577
library/anima_vae.py
Normal file
@@ -0,0 +1,577 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.cuda.amp as amp
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from einops import rearrange
|
||||||
|
|
||||||
|
CACHE_T = 2
|
||||||
|
|
||||||
|
|
||||||
|
class CausalConv3d(nn.Conv3d):
    """Conv3d with causal (past-only) temporal padding.

    Spatial padding stays symmetric; the temporal axis is padded only on the
    front, so an output frame never depends on future frames. ``cache_x`` lets
    chunked streaming prepend trailing frames of the previous chunk instead of
    zero padding.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # F.pad order: (w_left, w_right, h_left, h_right, t_front, t_back).
        # All temporal padding goes to the front; conv itself pads nothing.
        self._padding = (self.padding[2], self.padding[2], self.padding[1],
                         self.padding[1], 2 * self.padding[0], 0)
        self.padding = (0, 0, 0)

    def forward(self, x, cache_x=None):
        pad = list(self._padding)
        if cache_x is not None and self._padding[4] > 0:
            # Prepend cached frames from the previous chunk and shrink the
            # explicit front padding by the same amount.
            x = torch.cat([cache_x.to(x.device), x], dim=2)
            pad[4] -= cache_x.shape[2]
        x = F.pad(x, pad)
        return super().forward(x)
|
|
||||||
|
|
||||||
|
class RMS_norm(nn.Module):
    """RMS normalization with a learnable per-channel gain and optional bias.

    ``channel_first`` selects whether the feature dimension is dim 1 (conv
    layouts) or the last dim; ``images`` controls how many trailing singleton
    dims the gain broadcasts over (2 for images, 3 for video).
    """

    def __init__(self, dim, channel_first=True, images=True, bias=False):
        super().__init__()
        trailing = (1, 1) if images else (1, 1, 1)
        param_shape = (dim, *trailing) if channel_first else (dim,)

        self.channel_first = channel_first
        self.scale = dim**0.5
        self.gamma = nn.Parameter(torch.ones(param_shape))
        # Bias is a plain 0.0 when disabled so the forward add is a no-op.
        self.bias = nn.Parameter(torch.zeros(param_shape)) if bias else 0.

    def forward(self, x):
        norm_dim = 1 if self.channel_first else -1
        normalized = F.normalize(x, dim=norm_dim)
        return normalized * self.scale * self.gamma + self.bias
|
|
||||||
|
|
||||||
|
class Upsample(nn.Upsample):
    """nn.Upsample that upcasts to float32 before interpolating.

    Works around missing bfloat16 support in nearest-neighbor interpolation;
    the result is cast back to the input dtype.
    """

    def forward(self, x):
        upsampled = super().forward(x.float())
        return upsampled.type_as(x)
|
|
||||||
|
|
||||||
|
class Resample(nn.Module):
    """Spatial/temporal resampling layer for the WanVAE encoder/decoder.

    Modes:
      - 'upsample2d' / 'upsample3d': 2x nearest-exact spatial upsample followed by a
        conv that halves the channel count; '3d' additionally doubles the frame
        count via a causal time conv (its doubled output channels are interleaved
        into doubled frames in forward()).
      - 'downsample2d' / 'downsample3d': 2x strided spatial downsample; '3d'
        additionally halves the frame count via a temporally strided causal conv.
      - 'none': identity.
    """

    def __init__(self, dim, mode):
        assert mode in ('none', 'upsample2d', 'upsample3d', 'downsample2d', 'downsample3d')
        super().__init__()
        self.dim = dim
        self.mode = mode

        # layers
        if mode == 'upsample2d':
            self.resample = nn.Sequential(
                Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
                nn.Conv2d(dim, dim // 2, 3, padding=1))
        elif mode == 'upsample3d':
            self.resample = nn.Sequential(
                Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
                nn.Conv2d(dim, dim // 2, 3, padding=1))
            # Doubles channels; forward() reshapes them into 2x frames.
            self.time_conv = CausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))

        elif mode == 'downsample2d':
            self.resample = nn.Sequential(
                nn.ZeroPad2d((0, 1, 0, 1)),
                nn.Conv2d(dim, dim, 3, stride=(2, 2)))
        elif mode == 'downsample3d':
            self.resample = nn.Sequential(
                nn.ZeroPad2d((0, 1, 0, 1)),
                nn.Conv2d(dim, dim, 3, stride=(2, 2)))
            self.time_conv = CausalConv3d(dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))

        else:
            self.resample = nn.Identity()

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # NOTE: feat_idx is a mutable default deliberately used as an in/out
        # cursor into feat_cache when the caller threads both through.
        b, c, t, h, w = x.size()
        if self.mode == 'upsample3d':
            if feat_cache is not None:
                idx = feat_idx[0]
                if feat_cache[idx] is None:
                    # First chunk: mark with the 'Rep' sentinel (no real history yet);
                    # the temporal upsample is skipped entirely for this chunk.
                    feat_cache[idx] = 'Rep'
                    feat_idx[0] += 1
                else:
                    cache_x = x[:, :, -CACHE_T:, :, :].clone()
                    if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != 'Rep':
                        # cache last frame of last two chunk
                        cache_x = torch.cat(
                            [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
                    if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == 'Rep':
                        # No real history behind the sentinel: left-pad the cache with zeros.
                        cache_x = torch.cat(
                            [torch.zeros_like(cache_x).to(cache_x.device), cache_x], dim=2)
                    if feat_cache[idx] == 'Rep':
                        x = self.time_conv(x)
                    else:
                        x = self.time_conv(x, feat_cache[idx])
                    feat_cache[idx] = cache_x
                    feat_idx[0] += 1

                    # Interleave the doubled channels into doubled frames:
                    # (b, 2c, t, h, w) -> (b, c, 2t, h, w)
                    x = x.reshape(b, 2, c, t, h, w)
                    x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
                    x = x.reshape(b, c, t * 2, h, w)
        # Spatial resample is applied per-frame by folding time into the batch.
        t = x.shape[2]
        x = rearrange(x, 'b c t h w -> (b t) c h w')
        x = self.resample(x)
        x = rearrange(x, '(b t) c h w -> b c t h w', t=t)

        if self.mode == 'downsample3d':
            if feat_cache is not None:
                idx = feat_idx[0]
                if feat_cache[idx] is None:
                    feat_cache[idx] = x.clone()
                    feat_idx[0] += 1
                else:
                    cache_x = x[:, :, -1:, :, :].clone()
                    # Prepend the last cached frame so the strided time conv sees continuity.
                    x = self.time_conv(
                        torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
                    feat_cache[idx] = cache_x
                    feat_idx[0] += 1
        return x

    def init_weight(self, conv):
        # Identity-like init: zero everything, then pass input straight through
        # at the center temporal tap.
        conv_weight = conv.weight
        nn.init.zeros_(conv_weight)
        c1, c2, t, h, w = conv_weight.size()
        one_matrix = torch.eye(c1, c2)
        init_matrix = one_matrix
        nn.init.zeros_(conv_weight)
        conv_weight.data[:, :, 1, 0, 0] = init_matrix
        conv.weight.data.copy_(conv_weight)
        nn.init.zeros_(conv.bias.data)

    def init_weight2(self, conv):
        # Duplicate-identity init: both output halves copy the input at the last tap.
        conv_weight = conv.weight.data
        nn.init.zeros_(conv_weight)
        c1, c2, t, h, w = conv_weight.size()
        init_matrix = torch.eye(c1 // 2, c2)
        conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
        conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
        conv.weight.data.copy_(conv_weight)
        nn.init.zeros_(conv.bias.data)
|
|
||||||
|
|
||||||
|
class ResidualBlock(nn.Module):
    """Pre-norm residual block: two causal 3D convs with a learned (or identity) shortcut.

    ``feat_cache``/``feat_idx`` thread per-conv temporal caches through the
    block for chunked streaming; ``feat_idx`` is a single-element list used as
    a mutable cursor.
    """

    def __init__(self, in_dim, out_dim, dropout=0.0):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim

        # layers
        self.residual = nn.Sequential(
            RMS_norm(in_dim, images=False), nn.SiLU(),
            CausalConv3d(in_dim, out_dim, 3, padding=1),
            RMS_norm(out_dim, images=False), nn.SiLU(), nn.Dropout(dropout),
            CausalConv3d(out_dim, out_dim, 3, padding=1))
        self.shortcut = CausalConv3d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # The shortcut path sees the untouched input, captured before the
        # residual path starts mutating x.
        skip = self.shortcut(x)
        for layer in self.residual:
            if feat_cache is not None and isinstance(layer, CausalConv3d):
                idx = feat_idx[0]
                tail = x[:, :, -CACHE_T:, :, :].clone()
                if tail.shape[2] < 2 and feat_cache[idx] is not None:
                    # Borrow the final frame of the previous chunk so the cache
                    # always holds two frames.
                    prev_last = feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(tail.device)
                    tail = torch.cat([prev_last, tail], dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = tail
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x + skip
|
|
||||||
|
|
||||||
|
class AttentionBlock(nn.Module):
    """Single-head self-attention applied per frame over spatial positions.

    Time is folded into the batch dimension, so attention mixes only the h*w
    positions within each frame. The output projection is zero-initialized so
    the block starts out as an identity mapping.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

        # layers
        self.norm = RMS_norm(dim)
        self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
        self.proj = nn.Conv2d(dim, dim, 1)

        # zero out the last layer params
        nn.init.zeros_(self.proj.weight)

    def forward(self, x):
        residual = x
        b, c, t, h, w = x.size()
        x = rearrange(x, 'b c t h w -> (b t) c h w')
        x = self.norm(x)

        # Project to q/k/v of shape (b*t, 1, h*w, c) each; tokens are spatial positions.
        qkv = self.to_qkv(x)
        qkv = qkv.reshape(b * t, 1, c * 3, -1).permute(0, 1, 3, 2).contiguous()
        q, k, v = qkv.chunk(3, dim=-1)

        # apply attention
        attn_out = F.scaled_dot_product_attention(q, k, v)
        x = attn_out.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)

        # output projection and residual connection
        x = self.proj(x)
        x = rearrange(x, '(b t) c h w-> b c t h w', t=t)
        return x + residual
|
|
||||||
|
|
||||||
|
class Encoder3d(nn.Module):
    """Causal 3D encoder: conv stem -> downsample stages -> middle blocks -> z head.

    forward() supports chunked streaming via feat_cache/feat_idx: feat_cache is
    a list with one slot per CausalConv3d in the module (see count_conv3d) and
    feat_idx is a single-element list used as a mutable cursor into it.
    """

    def __init__(self,
                 dim=128,
                 z_dim=4,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_downsample=[True, True, False],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample

        # dimensions
        dims = [dim * u for u in [1] + dim_mult]
        # Current spatial scale relative to the input; compared against
        # attn_scales to decide where attention blocks are inserted.
        scale = 1.0

        # init block
        self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)

        # downsample blocks
        downsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            # residual (+attention) blocks
            for _ in range(num_res_blocks):
                downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
                if scale in attn_scales:
                    downsamples.append(AttentionBlock(out_dim))
                in_dim = out_dim

            # downsample block (skipped after the last stage)
            if i != len(dim_mult) - 1:
                mode = 'downsample3d' if temperal_downsample[i] else 'downsample2d'
                downsamples.append(Resample(out_dim, mode=mode))
                scale /= 2.0
        self.downsamples = nn.Sequential(*downsamples)

        # middle blocks
        self.middle = nn.Sequential(
            ResidualBlock(out_dim, out_dim, dropout), AttentionBlock(out_dim),
            ResidualBlock(out_dim, out_dim, dropout))

        # output blocks
        self.head = nn.Sequential(
            RMS_norm(out_dim, images=False), nn.SiLU(),
            CausalConv3d(out_dim, z_dim, 3, padding=1))

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # NOTE: feat_idx is a mutable default deliberately used as an in/out cursor.
        # Stem conv with per-conv temporal caching.
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, -CACHE_T:, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # cache last frame of last two chunk
                cache_x = torch.cat(
                    [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
            x = self.conv1(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x)

        ## downsamples
        for layer in self.downsamples:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## middle
        for layer in self.middle:
            if isinstance(layer, ResidualBlock) and feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## head
        for layer in self.head:
            if isinstance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # cache last frame of last two chunk
                    cache_x = torch.cat(
                        [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x
|
|
||||||
|
|
||||||
|
class Decoder3d(nn.Module):
    """Causal 3D decoder: z conv stem -> middle blocks -> upsample stages -> RGB head.

    Mirrors Encoder3d; forward() supports chunked streaming via
    feat_cache/feat_idx (one cache slot per CausalConv3d, mutable cursor).
    """

    def __init__(self,
                 dim=128,
                 z_dim=4,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_upsample=[False, True, True],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_upsample = temperal_upsample

        # dimensions
        dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
        # Spatial scale at the decoder input (smallest resolution).
        scale = 1.0 / 2**(len(dim_mult) - 2)

        # init block
        self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)

        # middle blocks
        self.middle = nn.Sequential(
            ResidualBlock(dims[0], dims[0], dropout), AttentionBlock(dims[0]),
            ResidualBlock(dims[0], dims[0], dropout))

        # upsample blocks
        upsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            # residual (+attention) blocks
            # After stage 0, the preceding Resample's Conv2d halved the channel
            # count, so the incoming dim is half of the nominal in_dim.
            if i == 1 or i == 2 or i == 3:
                in_dim = in_dim // 2
            for _ in range(num_res_blocks + 1):
                upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
                if scale in attn_scales:
                    upsamples.append(AttentionBlock(out_dim))
                in_dim = out_dim

            # upsample block (skipped after the last stage)
            if i != len(dim_mult) - 1:
                mode = 'upsample3d' if temperal_upsample[i] else 'upsample2d'
                upsamples.append(Resample(out_dim, mode=mode))
                scale *= 2.0
        self.upsamples = nn.Sequential(*upsamples)

        # output blocks
        self.head = nn.Sequential(
            RMS_norm(out_dim, images=False), nn.SiLU(),
            CausalConv3d(out_dim, 3, 3, padding=1))

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # NOTE: feat_idx is a mutable default deliberately used as an in/out cursor.
        ## conv1
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, -CACHE_T:, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # cache last frame of last two chunk
                cache_x = torch.cat(
                    [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
            x = self.conv1(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x)

        ## middle
        for layer in self.middle:
            if isinstance(layer, ResidualBlock) and feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## upsamples
        for layer in self.upsamples:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## head
        for layer in self.head:
            if isinstance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # cache last frame of last two chunk
                    cache_x = torch.cat(
                        [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x
|
|
||||||
|
|
||||||
|
def count_conv3d(model):
    """Return the number of CausalConv3d modules in ``model``.

    Used to size the per-conv streaming feature caches (one slot each).
    """
    return sum(1 for module in model.modules() if isinstance(module, CausalConv3d))
|
|
||||||
|
|
||||||
|
class WanVAE_(nn.Module):
    """Wan causal video VAE: Encoder3d + Decoder3d with streaming feature caches.

    encode()/decode() process frames in temporal chunks (a single leading frame
    followed by 4-frame chunks on the pixel side; one latent frame at a time on
    the decode side) and (de)normalize latents with the supplied
    scale = [mean, 1/std].
    """

    def __init__(self,
                 dim=128,
                 z_dim=4,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_downsample=[True, True, False],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample
        self.temperal_upsample = temperal_downsample[::-1]

        # modules; the encoder emits 2*z_dim channels (mu, log_var halves)
        self.encoder = Encoder3d(dim, z_dim * 2, dim_mult, num_res_blocks,
                                 attn_scales, self.temperal_downsample, dropout)
        self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
        self.conv2 = CausalConv3d(z_dim, z_dim, 1)
        self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks,
                                 attn_scales, self.temperal_upsample, dropout)

    def forward(self, x):
        # NOTE(review): encode() below requires a `scale` argument and returns
        # only mu, so this method (and sample()) would raise a TypeError if
        # called. They appear to be unused legacy paths — confirm before use.
        mu, log_var = self.encode(x)
        z = self.reparameterize(mu, log_var)
        x_recon = self.decode(z)
        return x_recon, mu, log_var

    def encode(self, x, scale):
        # Encode pixels (b, 3, t, h, w) to the normalized latent mean.
        self.clear_cache()
        ## cache
        t = x.shape[2]
        iter_ = 1 + (t - 1) // 4
        # First chunk is the single leading frame; subsequent chunks are 4 frames
        # each, streamed through the encoder's feature cache.
        for i in range(iter_):
            self._enc_conv_idx = [0]
            if i == 0:
                out = self.encoder(
                    x[:, :, :1, :, :],
                    feat_cache=self._enc_feat_map,
                    feat_idx=self._enc_conv_idx)
            else:
                out_ = self.encoder(
                    x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
                    feat_cache=self._enc_feat_map,
                    feat_idx=self._enc_conv_idx)
                out = torch.cat([out, out_], 2)
        mu, log_var = self.conv1(out).chunk(2, dim=1)
        # Normalize: (mu - mean) * (1/std); scale entries may be tensors or scalars.
        if isinstance(scale[0], torch.Tensor):
            mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
                1, self.z_dim, 1, 1, 1)
        else:
            mu = (mu - scale[0]) * scale[1]
        self.clear_cache()
        return mu

    def decode(self, z, scale):
        # Decode normalized latents back to pixels.
        self.clear_cache()
        # z: [b,c,t,h,w]
        # Denormalize: z / (1/std) + mean.
        if isinstance(scale[0], torch.Tensor):
            z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
                1, self.z_dim, 1, 1, 1)
        else:
            z = z / scale[1] + scale[0]
        iter_ = z.shape[2]
        x = self.conv2(z)
        # Decode one latent frame at a time, streaming through the decoder cache.
        for i in range(iter_):
            self._conv_idx = [0]
            if i == 0:
                out = self.decoder(
                    x[:, :, i:i + 1, :, :],
                    feat_cache=self._feat_map,
                    feat_idx=self._conv_idx)
            else:
                out_ = self.decoder(
                    x[:, :, i:i + 1, :, :],
                    feat_cache=self._feat_map,
                    feat_idx=self._conv_idx)
                out = torch.cat([out, out_], 2)
        self.clear_cache()
        return out

    def reparameterize(self, mu, log_var):
        # Sample z ~ N(mu, sigma^2) with the reparameterization trick.
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return eps * std + mu

    def sample(self, imgs, deterministic=False):
        # NOTE(review): see forward() — encode()'s current signature makes this
        # call and the tuple unpacking invalid; appears to be an unused legacy path.
        mu, log_var = self.encode(imgs)
        if deterministic:
            return mu
        std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
        return mu + std * torch.randn_like(std)

    def clear_cache(self):
        # Reset the per-CausalConv3d streaming caches for decoder and encoder.
        self._conv_num = count_conv3d(self.decoder)
        self._conv_idx = [0]
        self._feat_map = [None] * self._conv_num
        #cache encode
        self._enc_conv_num = count_conv3d(self.encoder)
        self._enc_conv_idx = [0]
        self._enc_feat_map = [None] * self._enc_conv_num
429
library/strategy_anima.py
Normal file
429
library/strategy_anima.py
Normal file
@@ -0,0 +1,429 @@
|
|||||||
|
# Anima Strategy Classes
|
||||||
|
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from typing import Any, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from library import anima_utils, train_util
|
||||||
|
from library.strategy_base import LatentsCachingStrategy, TextEncodingStrategy, TokenizeStrategy, TextEncoderOutputsCachingStrategy
|
||||||
|
|
||||||
|
from library.utils import setup_logging
|
||||||
|
|
||||||
|
setup_logging()
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class AnimaTokenizeStrategy(TokenizeStrategy):
    """Dual Qwen3 + T5 tokenization for Anima.

    Qwen3 token IDs feed the Qwen3 text encoder; T5 token IDs serve only as
    target input IDs for the LLM Adapter and are never encoded by a T5 model.

    Tokenizers may be supplied pre-loaded, or loaded from the given paths.
    """

    def __init__(
        self,
        qwen3_tokenizer=None,
        t5_tokenizer=None,
        qwen3_max_length: int = 512,
        t5_max_length: int = 512,
        qwen3_path: Optional[str] = None,
        t5_tokenizer_path: Optional[str] = None,
    ) -> None:
        # Lazily load whichever tokenizers were not handed in directly.
        if qwen3_tokenizer is None:
            if qwen3_path is None:
                raise ValueError("Either qwen3_tokenizer or qwen3_path must be provided")
            qwen3_tokenizer = anima_utils.load_qwen3_tokenizer(qwen3_path)
        if t5_tokenizer is None:
            t5_tokenizer = anima_utils.load_t5_tokenizer(t5_tokenizer_path)

        self.qwen3_tokenizer = qwen3_tokenizer
        self.t5_tokenizer = t5_tokenizer
        self.qwen3_max_length = qwen3_max_length
        self.t5_max_length = t5_max_length

    def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]:
        """Return [qwen3_input_ids, qwen3_attn_mask, t5_input_ids, t5_attn_mask]."""
        texts = [text] if isinstance(text, str) else text

        def _encode(tokenizer, max_length):
            # Fixed-length padding keeps batch shapes uniform for caching.
            return tokenizer.batch_encode_plus(
                texts,
                return_tensors="pt",
                truncation=True,
                padding="max_length",
                max_length=max_length,
            )

        qwen3_enc = _encode(self.qwen3_tokenizer, self.qwen3_max_length)
        t5_enc = _encode(self.t5_tokenizer, self.t5_max_length)

        return [
            qwen3_enc["input_ids"],
            qwen3_enc["attention_mask"],
            t5_enc["input_ids"],
            t5_enc["attention_mask"],
        ]
|
|
||||||
|
|
||||||
|
class AnimaTextEncodingStrategy(TextEncodingStrategy):
    """Text encoding strategy for Anima.

    Encodes Qwen3 tokens through the Qwen3 text encoder to get hidden states.
    T5 tokens are passed through unchanged (only used by LLM Adapter).

    Caption dropout replaces dropped items with the embedding of the empty
    caption "" (matching diffusion-pipe-main) rather than with zeros.
    """

    def __init__(
        self,
        dropout_rate: float = 0.0,
    ) -> None:
        # Probability of replacing a caption's embedding with the unconditional one.
        self.dropout_rate = dropout_rate
        # Cached unconditional embeddings (from encoding empty caption "").
        # Must be initialized via cache_uncond_embeddings() before the text
        # encoder is deleted; otherwise cached-output dropout falls back to zeros.
        self._uncond_prompt_embeds: Optional[torch.Tensor] = None  # (1, seq_len, hidden)
        self._uncond_attn_mask: Optional[torch.Tensor] = None  # (1, seq_len)
        self._uncond_t5_input_ids: Optional[torch.Tensor] = None  # (1, t5_seq_len)
        self._uncond_t5_attn_mask: Optional[torch.Tensor] = None  # (1, t5_seq_len)

    def cache_uncond_embeddings(
        self,
        tokenize_strategy: TokenizeStrategy,
        models: List[Any],
    ) -> None:
        """Pre-encode empty caption "" and cache the unconditional embeddings.

        Must be called before the text encoder is deleted from GPU.
        This matches diffusion-pipe-main behavior where empty caption embeddings
        are pre-cached and swapped in during caption dropout.
        """
        logger.info("Caching unconditional embeddings for caption dropout (encoding empty caption)...")
        tokens = tokenize_strategy.tokenize("")
        with torch.no_grad():
            uncond_outputs = self.encode_tokens(tokenize_strategy, models, tokens, enable_dropout=False)
        # Store as CPU tensors (1, seq_len, ...) to avoid GPU memory waste
        self._uncond_prompt_embeds = uncond_outputs[0].cpu()
        self._uncond_attn_mask = uncond_outputs[1].cpu()
        self._uncond_t5_input_ids = uncond_outputs[2].cpu()
        self._uncond_t5_attn_mask = uncond_outputs[3].cpu()
        logger.info(" Unconditional embeddings cached successfully")

    def encode_tokens(
        self,
        tokenize_strategy: TokenizeStrategy,
        models: List[Any],
        tokens: List[torch.Tensor],
        enable_dropout: bool = True,
    ) -> List[torch.Tensor]:
        """Encode Qwen3 tokens and return embeddings + T5 token IDs.

        Args:
            models: [qwen3_text_encoder]
            tokens: [qwen3_input_ids, qwen3_attn_mask, t5_input_ids, t5_attn_mask]
            enable_dropout: apply caption dropout per item (disabled when caching)

        Returns:
            [prompt_embeds, attn_mask, t5_input_ids, t5_attn_mask]
        """

        qwen3_text_encoder = models[0]
        qwen3_input_ids, qwen3_attn_mask, t5_input_ids, t5_attn_mask = tokens

        # Handle dropout: replace dropped items with unconditional embeddings (matching diffusion-pipe-main)
        batch_size = qwen3_input_ids.shape[0]
        non_drop_indices = []
        for i in range(batch_size):
            drop = enable_dropout and (self.dropout_rate > 0.0 and random.random() < self.dropout_rate)
            if not drop:
                non_drop_indices.append(i)

        encoder_device = qwen3_text_encoder.device if hasattr(qwen3_text_encoder, 'device') else next(qwen3_text_encoder.parameters()).device

        if len(non_drop_indices) > 0 and len(non_drop_indices) < batch_size:
            # Only encode non-dropped items to save compute
            nd_input_ids = qwen3_input_ids[non_drop_indices].to(encoder_device)
            nd_attn_mask = qwen3_attn_mask[non_drop_indices].to(encoder_device)
        elif len(non_drop_indices) == batch_size:
            nd_input_ids = qwen3_input_ids.to(encoder_device)
            nd_attn_mask = qwen3_attn_mask.to(encoder_device)
        else:
            nd_input_ids = None
            nd_attn_mask = None

        # BUGFIX: initialize nd_encoded_text unconditionally. Previously, when
        # every item in the batch was dropped (nd_input_ids is None), the name
        # was never bound and the shape/dtype lookups below raised NameError.
        nd_encoded_text = None
        if nd_input_ids is not None:
            outputs = qwen3_text_encoder(input_ids=nd_input_ids, attention_mask=nd_attn_mask)
            nd_encoded_text = outputs.last_hidden_state
            # Zero out padding positions
            nd_encoded_text[~nd_attn_mask.bool()] = 0

        # Build full batch: fill non-dropped with encoded, dropped with unconditional
        if len(non_drop_indices) == batch_size:
            prompt_embeds = nd_encoded_text
            attn_mask = qwen3_attn_mask.to(encoder_device)
        else:
            # Get unconditional embeddings
            if self._uncond_prompt_embeds is not None:
                uncond_pe = self._uncond_prompt_embeds[0]
                uncond_am = self._uncond_attn_mask[0]
                uncond_t5_ids = self._uncond_t5_input_ids[0]
                uncond_t5_am = self._uncond_t5_attn_mask[0]
            else:
                # Encode empty caption on-the-fly (text encoder still available)
                uncond_tokens = tokenize_strategy.tokenize("")
                uncond_ids = uncond_tokens[0].to(encoder_device)
                uncond_mask = uncond_tokens[1].to(encoder_device)
                uncond_out = qwen3_text_encoder(input_ids=uncond_ids, attention_mask=uncond_mask)
                uncond_pe = uncond_out.last_hidden_state[0]
                uncond_pe[~uncond_mask[0].bool()] = 0
                uncond_am = uncond_mask[0]
                uncond_t5_ids = uncond_tokens[2][0]
                uncond_t5_am = uncond_tokens[3][0]

            seq_len = qwen3_input_ids.shape[1]
            hidden_size = nd_encoded_text.shape[-1] if nd_encoded_text is not None else uncond_pe.shape[-1]
            dtype = nd_encoded_text.dtype if nd_encoded_text is not None else uncond_pe.dtype

            prompt_embeds = torch.zeros((batch_size, seq_len, hidden_size), device=encoder_device, dtype=dtype)
            attn_mask = torch.zeros((batch_size, seq_len), device=encoder_device, dtype=qwen3_attn_mask.dtype)

            if len(non_drop_indices) > 0:
                prompt_embeds[non_drop_indices] = nd_encoded_text
                attn_mask[non_drop_indices] = nd_attn_mask

            # Fill dropped items with unconditional embeddings.
            # Clone the T5 tensors first so we don't mutate the caller's batch.
            t5_input_ids = t5_input_ids.clone()
            t5_attn_mask = t5_attn_mask.clone()
            drop_indices = [i for i in range(batch_size) if i not in non_drop_indices]
            for i in drop_indices:
                prompt_embeds[i] = uncond_pe.to(device=encoder_device, dtype=dtype)
                attn_mask[i] = uncond_am.to(device=encoder_device, dtype=qwen3_attn_mask.dtype)
                t5_input_ids[i] = uncond_t5_ids.to(device=t5_input_ids.device, dtype=t5_input_ids.dtype)
                t5_attn_mask[i] = uncond_t5_am.to(device=t5_attn_mask.device, dtype=t5_attn_mask.dtype)

        return [prompt_embeds, attn_mask, t5_input_ids, t5_attn_mask]

    def drop_cached_text_encoder_outputs(
        self,
        prompt_embeds: torch.Tensor,
        attn_mask: torch.Tensor,
        t5_input_ids: torch.Tensor,
        t5_attn_mask: torch.Tensor,
    ) -> List[torch.Tensor]:
        """Apply dropout to cached text encoder outputs.

        Called during training when using cached outputs.
        Replaces dropped items with pre-cached unconditional embeddings (from encoding "")
        to match diffusion-pipe-main behavior.
        """
        if prompt_embeds is not None and self.dropout_rate > 0.0:
            # Clone to avoid in-place modification of cached tensors
            prompt_embeds = prompt_embeds.clone()
            if attn_mask is not None:
                attn_mask = attn_mask.clone()
            if t5_input_ids is not None:
                t5_input_ids = t5_input_ids.clone()
            if t5_attn_mask is not None:
                t5_attn_mask = t5_attn_mask.clone()

            for i in range(prompt_embeds.shape[0]):
                if random.random() < self.dropout_rate:
                    if self._uncond_prompt_embeds is not None:
                        # Use pre-cached unconditional embeddings
                        prompt_embeds[i] = self._uncond_prompt_embeds[0].to(device=prompt_embeds.device, dtype=prompt_embeds.dtype)
                        if attn_mask is not None:
                            attn_mask[i] = self._uncond_attn_mask[0].to(device=attn_mask.device, dtype=attn_mask.dtype)
                        if t5_input_ids is not None:
                            t5_input_ids[i] = self._uncond_t5_input_ids[0].to(device=t5_input_ids.device, dtype=t5_input_ids.dtype)
                        if t5_attn_mask is not None:
                            t5_attn_mask[i] = self._uncond_t5_attn_mask[0].to(device=t5_attn_mask.device, dtype=t5_attn_mask.dtype)
                    else:
                        # Fallback: zero out (should not happen if cache_uncond_embeddings was called)
                        logger.warning("Unconditional embeddings not cached, falling back to zeros for caption dropout")
                        prompt_embeds[i] = torch.zeros_like(prompt_embeds[i])
                        if attn_mask is not None:
                            attn_mask[i] = torch.zeros_like(attn_mask[i])
                        if t5_input_ids is not None:
                            t5_input_ids[i] = torch.zeros_like(t5_input_ids[i])
                        if t5_attn_mask is not None:
                            t5_attn_mask[i] = torch.zeros_like(t5_attn_mask[i])

        return [prompt_embeds, attn_mask, t5_input_ids, t5_attn_mask]
|
||||||
|
|
||||||
|
|
||||||
|
class AnimaTextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy):
    """Caching strategy for Anima text encoder outputs.

    Each cache file holds four arrays: prompt_embeds (float), attn_mask (int),
    t5_input_ids (int) and t5_attn_mask (int).
    """

    ANIMA_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX = "_anima_te.npz"

    def __init__(
        self,
        cache_to_disk: bool,
        batch_size: int,
        skip_disk_cache_validity_check: bool,
        is_partial: bool = False,
    ) -> None:
        super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check, is_partial)

    def get_outputs_npz_path(self, image_abs_path: str) -> str:
        """Derive the cache-file path from the image path (extension swapped for the suffix)."""
        base, _ext = os.path.splitext(image_abs_path)
        return base + self.ANIMA_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX

    def is_disk_cached_outputs_expected(self, npz_path: str) -> bool:
        """Return True when a usable cache file already exists on disk."""
        if not self.cache_to_disk:
            return False
        if not os.path.exists(npz_path):
            return False
        if self.skip_disk_cache_validity_check:
            return True

        # Validate that all four arrays are present; unreadable files are fatal.
        try:
            npz = np.load(npz_path)
            for key in ("prompt_embeds", "attn_mask", "t5_input_ids", "t5_attn_mask"):
                if key not in npz:
                    return False
        except Exception as e:
            logger.error(f"Error loading file: {npz_path}")
            raise e

        return True

    def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]:
        """Load the four cached arrays in their canonical order."""
        data = np.load(npz_path)
        return [data[key] for key in ("prompt_embeds", "attn_mask", "t5_input_ids", "t5_attn_mask")]

    def cache_batch_outputs(
        self,
        tokenize_strategy: TokenizeStrategy,
        models: List[Any],
        text_encoding_strategy: TextEncodingStrategy,
        infos: List,
    ):
        """Encode a batch of captions and cache the outputs to disk or in memory."""
        encoding_strategy: AnimaTextEncodingStrategy = text_encoding_strategy
        captions = [info.caption for info in infos]

        tokens_and_masks = tokenize_strategy.tokenize(captions)
        with torch.no_grad():
            # Always disable dropout during caching
            prompt_embeds, attn_mask, t5_input_ids, t5_attn_mask = encoding_strategy.encode_tokens(
                tokenize_strategy,
                models,
                tokens_and_masks,
                enable_dropout=False,
            )

        # Convert to numpy for caching (bfloat16 has no numpy counterpart)
        if prompt_embeds.dtype == torch.bfloat16:
            prompt_embeds = prompt_embeds.float()
        prompt_embeds = prompt_embeds.cpu().numpy()
        attn_mask = attn_mask.cpu().numpy()
        t5_input_ids = t5_input_ids.cpu().numpy().astype(np.int32)
        t5_attn_mask = t5_attn_mask.cpu().numpy().astype(np.int32)

        for i, info in enumerate(infos):
            if self.cache_to_disk:
                np.savez(
                    info.text_encoder_outputs_npz,
                    prompt_embeds=prompt_embeds[i],
                    attn_mask=attn_mask[i],
                    t5_input_ids=t5_input_ids[i],
                    t5_attn_mask=t5_attn_mask[i],
                )
            else:
                info.text_encoder_outputs = (prompt_embeds[i], attn_mask[i], t5_input_ids[i], t5_attn_mask[i])
|
||||||
|
|
||||||
|
|
||||||
|
class AnimaLatentsCachingStrategy(LatentsCachingStrategy):
    """Latent caching strategy for Anima using WanVAE.

    WanVAE produces 16-channel latents with an 8x spatial downscale.
    Latent shape for images: (B, 16, 1, H/8, W/8)
    """

    ANIMA_LATENTS_NPZ_SUFFIX = "_anima.npz"

    def __init__(self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None:
        super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check)

    @property
    def cache_suffix(self) -> str:
        return self.ANIMA_LATENTS_NPZ_SUFFIX

    def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) -> str:
        """Cache path embeds the image size so multiple resolutions can coexist."""
        base = os.path.splitext(absolute_path)[0]
        size_tag = f"_{image_size[0]:04d}x{image_size[1]:04d}"
        return base + size_tag + self.ANIMA_LATENTS_NPZ_SUFFIX

    def is_disk_cached_latents_expected(
        self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool
    ):
        # 8 = WanVAE spatial downscale factor
        return self._default_is_disk_cached_latents_expected(
            8, bucket_reso, npz_path, flip_aug, alpha_mask, multi_resolution=True
        )

    def load_latents_from_disk(
        self, npz_path: str, bucket_reso: Tuple[int, int]
    ) -> Tuple[Optional[np.ndarray], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]:
        return self._default_load_latents_from_disk(8, npz_path, bucket_reso)

    def cache_batch_latents(self, vae, image_infos: List, flip_aug: bool, alpha_mask: bool, random_crop: bool):
        """Cache a batch of latents using WanVAE.

        vae is expected to be the WanVAE_ model (not the wrapper).
        The encoding function handles the mean/std normalization.
        """
        from library.anima_models import ANIMA_VAE_MEAN, ANIMA_VAE_STD

        device = next(vae.parameters()).device
        dtype = next(vae.parameters()).dtype

        # Normalization constants on the VAE device: latent = (x - mean) * (1 / std)
        mean = torch.tensor(ANIMA_VAE_MEAN, dtype=dtype, device=device)
        std = torch.tensor(ANIMA_VAE_STD, dtype=dtype, device=device)
        scale = [mean, 1.0 / std]

        def encode_by_vae(img_tensor):
            # img_tensor: (B, C, H, W) in [-1, 1] (already normalized by IMAGE_TRANSFORMS).
            # WanVAE expects a temporal axis, so expand to (B, C, 1, H, W) first.
            frames = img_tensor.unsqueeze(2).to(device, dtype=dtype)
            return vae.encode(frames, scale).to("cpu")

        self._default_cache_batch_latents(
            encode_by_vae, device, dtype, image_infos, flip_aug, alpha_mask, random_crop, multi_resolution=True
        )

        if not train_util.HIGH_VRAM:
            train_util.clean_memory_on_device(device)
|
||||||
@@ -524,7 +524,7 @@ class LatentsCachingStrategy:
|
|||||||
original_size = original_sizes[i]
|
original_size = original_sizes[i]
|
||||||
crop_ltrb = crop_ltrbs[i]
|
crop_ltrb = crop_ltrbs[i]
|
||||||
|
|
||||||
latents_size = latents.shape[1:3] # H, W
|
latents_size = latents.shape[-2:] # H, W (supports both 4D and 5D latents)
|
||||||
key_reso_suffix = f"_{latents_size[0]}x{latents_size[1]}" if multi_resolution else "" # e.g. "_32x64", HxW
|
key_reso_suffix = f"_{latents_size[0]}x{latents_size[1]}" if multi_resolution else "" # e.g. "_32x64", HxW
|
||||||
|
|
||||||
if self.cache_to_disk:
|
if self.cache_to_disk:
|
||||||
|
|||||||
@@ -6138,7 +6138,8 @@ def conditional_loss(
|
|||||||
elif loss_type == "huber":
|
elif loss_type == "huber":
|
||||||
if huber_c is None:
|
if huber_c is None:
|
||||||
raise NotImplementedError("huber_c not implemented correctly")
|
raise NotImplementedError("huber_c not implemented correctly")
|
||||||
huber_c = huber_c.view(-1, 1, 1, 1)
|
# Reshape huber_c to broadcast with model_pred (supports 4D and 5D tensors)
|
||||||
|
huber_c = huber_c.view(-1, *([1] * (model_pred.ndim - 1)))
|
||||||
loss = 2 * huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
|
loss = 2 * huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
|
||||||
if reduction == "mean":
|
if reduction == "mean":
|
||||||
loss = torch.mean(loss)
|
loss = torch.mean(loss)
|
||||||
@@ -6147,7 +6148,8 @@ def conditional_loss(
|
|||||||
elif loss_type == "smooth_l1":
|
elif loss_type == "smooth_l1":
|
||||||
if huber_c is None:
|
if huber_c is None:
|
||||||
raise NotImplementedError("huber_c not implemented correctly")
|
raise NotImplementedError("huber_c not implemented correctly")
|
||||||
huber_c = huber_c.view(-1, 1, 1, 1)
|
# Reshape huber_c to broadcast with model_pred (supports 4D and 5D tensors)
|
||||||
|
huber_c = huber_c.view(-1, *([1] * (model_pred.ndim - 1)))
|
||||||
loss = 2 * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
|
loss = 2 * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
|
||||||
if reduction == "mean":
|
if reduction == "mean":
|
||||||
loss = torch.mean(loss)
|
loss = torch.mean(loss)
|
||||||
|
|||||||
635
networks/lora_anima.py
Normal file
635
networks/lora_anima.py
Normal file
@@ -0,0 +1,635 @@
|
|||||||
|
# LoRA network module for Anima
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
from typing import Dict, List, Optional, Tuple, Type, Union
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from library.utils import setup_logging
|
||||||
|
|
||||||
|
setup_logging()
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
from networks.lora_flux import LoRAModule, LoRAInfModule
|
||||||
|
|
||||||
|
|
||||||
|
def create_network(
    multiplier: float,
    network_dim: Optional[int],
    network_alpha: Optional[float],
    vae,
    text_encoders: list,
    unet,
    neuron_dropout: Optional[float] = None,
    **kwargs,
):
    """Create an Anima LoRA network from ``--network_args`` style kwargs.

    Args:
        multiplier: global LoRA scale.
        network_dim: default rank; 4 when None.
        network_alpha: default alpha; 1.0 when None.
        vae: unused here; kept for the shared create_network interface.
        text_encoders: list of text encoder modules (Qwen3).
        unet: the Anima DiT model.
        neuron_dropout: per-neuron dropout probability for LoRA modules.
        **kwargs: network_args (normally string-valued, coming from the CLI).

    Returns:
        A configured LoRANetwork.
    """
    if network_dim is None:
        network_dim = 4
    if network_alpha is None:
        network_alpha = 1.0

    # network_args arrive as strings from the CLI; None means "not specified".
    def _opt_int(name: str) -> Optional[int]:
        value = kwargs.get(name, None)
        return int(value) if value is not None else None

    def _opt_float(name: str) -> Optional[float]:
        value = kwargs.get(name, None)
        return float(value) if value is not None else None

    def _flag(name: str) -> bool:
        # BUGFIX: accept a real boolean True as well as the CLI string "True".
        # The previous `True if v == "True" else False` silently turned a
        # boolean True into False.
        value = kwargs.get(name, False)
        return value is True or value == "True"

    # type_dims: [self_attn_dim, cross_attn_dim, mlp_dim, mod_dim, llm_adapter_dim]
    type_dims = [
        _opt_int("self_attn_dim"),
        _opt_int("cross_attn_dim"),
        _opt_int("mlp_dim"),
        _opt_int("mod_dim"),
        _opt_int("llm_adapter_dim"),
    ]
    if all(d is None for d in type_dims):
        type_dims = None

    # emb_dims: [x_embedder, t_embedder, final_layer], given as "a,b,c" or "[a,b,c]"
    emb_dims = kwargs.get("emb_dims", None)
    if emb_dims is not None:
        emb_dims = emb_dims.strip()
        if emb_dims.startswith("[") and emb_dims.endswith("]"):
            emb_dims = emb_dims[1:-1]
        emb_dims = [int(d) for d in emb_dims.split(",")]
        assert len(emb_dims) == 3, f"invalid emb_dims: {emb_dims}, must be 3 dimensions (x_embedder, t_embedder, final_layer)"

    # block selection: "all", "none"/"", or comma-separated indices and ranges like "0,2,5-8"
    def parse_block_selection(selection: str, total_blocks: int) -> List[bool]:
        if selection == "all":
            return [True] * total_blocks
        if selection == "none" or selection == "":
            return [False] * total_blocks

        selected = [False] * total_blocks
        for r in selection.split(","):
            if "-" in r:
                start, end = map(str.strip, r.split("-"))
                start, end = int(start), int(end)
                assert 0 <= start < total_blocks and 0 <= end < total_blocks and start <= end
                for i in range(start, end + 1):
                    selected[i] = True
            else:
                index = int(r)
                assert 0 <= index < total_blocks
                selected[index] = True
        return selected

    train_block_indices = kwargs.get("train_block_indices", None)
    if train_block_indices is not None:
        # Fall back to a generous bound when the model does not expose .blocks.
        num_blocks = len(unet.blocks) if hasattr(unet, 'blocks') else 999
        train_block_indices = parse_block_selection(train_block_indices, num_blocks)

    # train LLM adapter
    train_llm_adapter = _flag("train_llm_adapter")

    # rank/module dropout
    rank_dropout = _opt_float("rank_dropout")
    module_dropout = _opt_float("module_dropout")

    # verbose
    verbose = _flag("verbose")

    network = LoRANetwork(
        text_encoders,
        unet,
        multiplier=multiplier,
        lora_dim=network_dim,
        alpha=network_alpha,
        dropout=neuron_dropout,
        rank_dropout=rank_dropout,
        module_dropout=module_dropout,
        train_llm_adapter=train_llm_adapter,
        type_dims=type_dims,
        emb_dims=emb_dims,
        train_block_indices=train_block_indices,
        verbose=verbose,
    )

    # Optional LoRA+ learning-rate ratios.
    loraplus_lr_ratio = _opt_float("loraplus_lr_ratio")
    loraplus_unet_lr_ratio = _opt_float("loraplus_unet_lr_ratio")
    loraplus_text_encoder_lr_ratio = _opt_float("loraplus_text_encoder_lr_ratio")
    if loraplus_lr_ratio is not None or loraplus_unet_lr_ratio is not None or loraplus_text_encoder_lr_ratio is not None:
        network.set_loraplus_lr_ratio(loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio)

    return network
|
||||||
|
|
||||||
|
|
||||||
|
def create_network_from_weights(multiplier, file, ae, text_encoders, unet, weights_sd=None, for_inference=False, **kwargs):
    """Rebuild a LoRANetwork whose per-module dims/alphas come from a saved state dict.

    Returns (network, weights_sd); the caller loads the weights afterwards.
    """
    if weights_sd is None:
        if os.path.splitext(file)[1] == ".safetensors":
            from safetensors.torch import load_file

            weights_sd = load_file(file)
        else:
            weights_sd = torch.load(file, map_location="cpu")

    # Recover each module's rank and alpha from the stored keys.
    modules_dim = {}
    modules_alpha = {}
    train_llm_adapter = False
    for key, value in weights_sd.items():
        if "." not in key:
            continue

        lora_name = key.split(".")[0]
        if "alpha" in key:
            modules_alpha[lora_name] = value
        elif "lora_down" in key:
            # rank equals the output dimension of lora_down
            modules_dim[lora_name] = value.size()[0]

        # The presence of any llm_adapter module means the adapter was trained.
        if "llm_adapter" in lora_name:
            train_llm_adapter = True

    module_class = LoRAInfModule if for_inference else LoRAModule

    network = LoRANetwork(
        text_encoders,
        unet,
        multiplier=multiplier,
        modules_dim=modules_dim,
        modules_alpha=modules_alpha,
        module_class=module_class,
        train_llm_adapter=train_llm_adapter,
    )
    return network, weights_sd
|
||||||
|
|
||||||
|
|
||||||
|
class LoRANetwork(torch.nn.Module):
|
||||||
|
# Target modules: DiT blocks
|
||||||
|
ANIMA_TARGET_REPLACE_MODULE = ["Block"]
|
||||||
|
# Target modules: LLM Adapter blocks
|
||||||
|
ANIMA_ADAPTER_TARGET_REPLACE_MODULE = ["LLMAdapterTransformerBlock"]
|
||||||
|
# Target modules for text encoder (Qwen3)
|
||||||
|
TEXT_ENCODER_TARGET_REPLACE_MODULE = ["Qwen3Attention", "Qwen3MLP", "Qwen3SdpaAttention", "Qwen3FlashAttention2"]
|
||||||
|
|
||||||
|
LORA_PREFIX_ANIMA = "lora_unet" # ComfyUI compatible
|
||||||
|
LORA_PREFIX_TEXT_ENCODER = "lora_te1" # Qwen3
|
||||||
|
|
||||||
|
    def __init__(
        self,
        text_encoders: list,
        unet,
        multiplier: float = 1.0,
        lora_dim: int = 4,
        alpha: float = 1,
        dropout: Optional[float] = None,
        rank_dropout: Optional[float] = None,
        module_dropout: Optional[float] = None,
        module_class: Type[object] = LoRAModule,
        modules_dim: Optional[Dict[str, int]] = None,
        modules_alpha: Optional[Dict[str, int]] = None,
        train_llm_adapter: bool = False,
        type_dims: Optional[List[int]] = None,
        emb_dims: Optional[List[int]] = None,
        train_block_indices: Optional[List[bool]] = None,
        verbose: Optional[bool] = False,
    ) -> None:
        """Scan the text encoders and the Anima DiT and attach LoRA modules.

        When modules_dim/modules_alpha are given (re-creating from saved
        weights) only the listed modules are built with their stored
        rank/alpha; otherwise ranks default to lora_dim, optionally
        overridden per layer type via type_dims and per block via
        train_block_indices.
        """
        super().__init__()
        self.multiplier = multiplier
        self.lora_dim = lora_dim
        self.alpha = alpha
        self.dropout = dropout
        self.rank_dropout = rank_dropout
        self.module_dropout = module_dropout
        self.train_llm_adapter = train_llm_adapter
        self.type_dims = type_dims
        self.emb_dims = emb_dims
        self.train_block_indices = train_block_indices

        # LoRA+ ratios stay None until set_loraplus_lr_ratio() is called.
        self.loraplus_lr_ratio = None
        self.loraplus_unet_lr_ratio = None
        self.loraplus_text_encoder_lr_ratio = None

        if modules_dim is not None:
            logger.info(f"create LoRA network from weights")
            if self.emb_dims is None:
                self.emb_dims = [0] * 3
        else:
            logger.info(f"create LoRA network. base dim (rank): {lora_dim}, alpha: {alpha}")
            logger.info(f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}")

        # create module instances
        def create_modules(
            is_unet: bool,
            text_encoder_idx: Optional[int],
            root_module: torch.nn.Module,
            target_replace_modules: List[str],
            filter: Optional[str] = None,
            default_dim: Optional[int] = None,
            include_conv2d_if_filter: bool = False,
        ) -> Tuple[List[LoRAModule], List[str]]:
            # Walk root_module and build a LoRA module for each Linear /
            # 1x1-Conv2d child inside the target module classes.
            # target_replace_modules=None means "treat root_module itself as
            # the single target" (used for the embedder/final-layer filters).
            prefix = (
                self.LORA_PREFIX_ANIMA
                if is_unet
                else self.LORA_PREFIX_TEXT_ENCODER
            )

            loras = []
            skipped = []  # names skipped because their resolved dim was 0/None
            for name, module in root_module.named_modules():
                if target_replace_modules is None or module.__class__.__name__ in target_replace_modules:
                    if target_replace_modules is None:
                        module = root_module

                    for child_name, child_module in module.named_modules():
                        is_linear = child_module.__class__.__name__ == "Linear"
                        is_conv2d = child_module.__class__.__name__ == "Conv2d"
                        is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1)

                        if is_linear or is_conv2d:
                            # e.g. "lora_unet_blocks_0_self_attn_q" (dots -> underscores)
                            lora_name = prefix + "." + (name + "." if name else "") + child_name
                            lora_name = lora_name.replace(".", "_")

                            force_incl_conv2d = False
                            if filter is not None:
                                if filter not in lora_name:
                                    continue
                                force_incl_conv2d = include_conv2d_if_filter

                            dim = None
                            alpha_val = None

                            if modules_dim is not None:
                                # loading from weights: only known modules get LoRA
                                if lora_name in modules_dim:
                                    dim = modules_dim[lora_name]
                                    alpha_val = modules_alpha[lora_name]
                            else:
                                if is_linear or is_conv2d_1x1:
                                    dim = default_dim if default_dim is not None else self.lora_dim
                                    alpha_val = self.alpha

                                    if is_unet and type_dims is not None:
                                        # type_dims = [self_attn_dim, cross_attn_dim, mlp_dim, mod_dim, llm_adapter_dim]
                                        # Order matters: check most specific identifiers first to avoid mismatches.
                                        identifier_order = [
                                            (4, ("llm_adapter",)),
                                            (3, ("adaln_modulation",)),
                                            (0, ("self_attn",)),
                                            (1, ("cross_attn",)),
                                            (2, ("mlp",)),
                                        ]
                                        for idx, ids in identifier_order:
                                            d = type_dims[idx]
                                            if d is not None and all(id_str in lora_name for id_str in ids):
                                                dim = d  # 0 means skip
                                                break

                                    # block index filtering
                                    if is_unet and dim and self.train_block_indices is not None and "blocks_" in lora_name:
                                        # Extract block index from lora_name: "lora_unet_blocks_0_self_attn..."
                                        parts = lora_name.split("_")
                                        for pi, part in enumerate(parts):
                                            if part == "blocks" and pi + 1 < len(parts):
                                                try:
                                                    block_index = int(parts[pi + 1])
                                                    if not self.train_block_indices[block_index]:
                                                        dim = 0  # deselected block -> skip
                                                except (ValueError, IndexError):
                                                    pass
                                                break

                                elif force_incl_conv2d:
                                    # non-1x1 Conv2d normally skipped, but the
                                    # x_embedder filter explicitly includes it
                                    dim = default_dim if default_dim is not None else self.lora_dim
                                    alpha_val = self.alpha

                            if dim is None or dim == 0:
                                # only report skips for module kinds we would normally target
                                if is_linear or is_conv2d_1x1:
                                    skipped.append(lora_name)
                                continue

                            lora = module_class(
                                lora_name,
                                child_module,
                                self.multiplier,
                                dim,
                                alpha_val,
                                dropout=dropout,
                                rank_dropout=rank_dropout,
                                module_dropout=module_dropout,
                            )
                            loras.append(lora)

                if target_replace_modules is None:
                    break  # root_module was the single target; don't recurse further
            return loras, skipped

        # Create LoRA for text encoders (Qwen3 - typically not trained for Anima)
        self.text_encoder_loras: List[Union[LoRAModule, LoRAInfModule]] = []
        skipped_te = []
        if text_encoders is not None:
            for i, text_encoder in enumerate(text_encoders):
                if text_encoder is None:
                    continue
                logger.info(f"create LoRA for Text Encoder {i+1}:")
                te_loras, te_skipped = create_modules(
                    False, i, text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE
                )
                logger.info(f"create LoRA for Text Encoder {i+1}: {len(te_loras)} modules.")
                self.text_encoder_loras.extend(te_loras)
                skipped_te += te_skipped

        # Create LoRA for DiT blocks
        target_modules = list(LoRANetwork.ANIMA_TARGET_REPLACE_MODULE)
        if train_llm_adapter:
            target_modules.extend(LoRANetwork.ANIMA_ADAPTER_TARGET_REPLACE_MODULE)

        self.unet_loras: List[Union[LoRAModule, LoRAInfModule]]
        self.unet_loras, skipped_un = create_modules(True, None, unet, target_modules)

        # emb_dims: [x_embedder, t_embedder, final_layer]
        if self.emb_dims:
            for filter_name, in_dim in zip(
                ["x_embedder", "t_embedder", "final_layer"],
                self.emb_dims,
            ):
                loras, _ = create_modules(
                    True, None, unet, None,
                    filter=filter_name, default_dim=in_dim,
                    include_conv2d_if_filter=(filter_name == "x_embedder"),
                )
                self.unet_loras.extend(loras)

        logger.info(f"create LoRA for Anima DiT: {len(self.unet_loras)} modules.")
        if verbose:
            for lora in self.unet_loras:
                logger.info(f"\t{lora.lora_name:60} {lora.lora_dim}, {lora.alpha}")

        skipped = skipped_te + skipped_un
        if verbose and len(skipped) > 0:
            logger.warning(f"dim (rank) is 0, {len(skipped)} LoRA modules are skipped:")
            for name in skipped:
                logger.info(f"\t{name}")

        # assertion: no duplicate names
        names = set()
        for lora in self.text_encoder_loras + self.unet_loras:
            assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}"
            names.add(lora.lora_name)
|
||||||
|
|
||||||
|
def set_multiplier(self, multiplier):
    """Update the LoRA scaling multiplier on the network and on every child module."""
    self.multiplier = multiplier
    for module in (*self.text_encoder_loras, *self.unet_loras):
        module.multiplier = self.multiplier
def set_enabled(self, is_enabled):
    """Toggle every LoRA module (text encoder and DiT) on or off at once."""
    for module in (*self.text_encoder_loras, *self.unet_loras):
        module.enabled = is_enabled
def load_weights(self, file):
    """Load LoRA weights from *file* into this network.

    ``.safetensors`` files go through safetensors; anything else is treated as
    a torch checkpoint loaded onto CPU. Returns the ``load_state_dict`` result
    (missing / unexpected keys), loaded non-strictly.
    """
    _, ext = os.path.splitext(file)
    if ext == ".safetensors":
        from safetensors.torch import load_file

        weights_sd = load_file(file)
    else:
        weights_sd = torch.load(file, map_location="cpu")

    return self.load_state_dict(weights_sd, False)
def apply_to(self, text_encoders, unet, apply_text_encoder=True, apply_unet=True):
    """Hook every LoRA module into its wrapped layer and register it on this network.

    A disabled side has its LoRA list cleared instead of being applied.
    """
    if not apply_text_encoder:
        self.text_encoder_loras = []
    else:
        logger.info(f"enable LoRA for text encoder: {len(self.text_encoder_loras)} modules")

    if not apply_unet:
        self.unet_loras = []
    else:
        logger.info(f"enable LoRA for DiT: {len(self.unet_loras)} modules")

    for module in self.text_encoder_loras + self.unet_loras:
        module.apply_to()
        # registering makes the LoRA parameters part of this nn.Module
        self.add_module(module.lora_name, module)
def is_mergeable(self):
    """LoRA deltas can always be merged into the base model weights (see merge_to)."""
    return True
def merge_to(self, text_encoders, unet, weights_sd, dtype=None, device=None):
    """Fold the LoRA weights in *weights_sd* directly into the base model weights.

    Components with no matching keys in the state dict have their LoRA lists
    cleared so nothing is merged for them.
    """
    # detect which model parts the checkpoint actually covers
    apply_text_encoder = apply_unet = False
    for key in weights_sd.keys():
        if key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER):
            apply_text_encoder = True
        elif key.startswith(LoRANetwork.LORA_PREFIX_ANIMA):
            apply_unet = True

    if not apply_text_encoder:
        self.text_encoder_loras = []
    else:
        logger.info("enable LoRA for text encoder")

    if not apply_unet:
        self.unet_loras = []
    else:
        logger.info("enable LoRA for DiT")

    # hand each module the subset of keys that belongs to it, prefix stripped
    for module in self.text_encoder_loras + self.unet_loras:
        prefix_len = len(module.lora_name) + 1  # +1 drops the separating "."
        sub_sd = {
            key[prefix_len:]: value
            for key, value in weights_sd.items()
            if key.startswith(module.lora_name)
        }
        module.merge_to(sub_sd, dtype, device)

    logger.info(f"weights are merged")
def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio):
    """Store the LoRA+ learning-rate ratios used by the optimizer-param builder.

    Component-specific ratios (unet / text encoder) take precedence over the
    shared ratio when set (the ``or`` fallback below).
    """
    self.loraplus_lr_ratio = loraplus_lr_ratio
    self.loraplus_unet_lr_ratio = loraplus_unet_lr_ratio
    self.loraplus_text_encoder_lr_ratio = loraplus_text_encoder_lr_ratio

    logger.info(f"LoRA+ UNet LR Ratio: {self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio}")
    logger.info(f"LoRA+ Text Encoder LR Ratio: {self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio}")
def prepare_optimizer_params_with_multiple_te_lrs(self, text_encoder_lr, unet_lr, default_lr):
    """Build optimizer parameter groups for text-encoder and DiT LoRA modules.

    Args:
        text_encoder_lr: None, scalar, or list of LRs for the text encoder(s);
            normalized to a one-element list below.
        unet_lr: LR for the DiT LoRA modules; falls back to *default_lr* when None.
        default_lr: fallback learning rate.

    Returns:
        (all_params, lr_descriptions): optimizer param-group dicts and a parallel
        list of human-readable group labels ("textencoder 1", "unet", with a
        " plus" suffix for LoRA+ groups).
    """
    # normalize text_encoder_lr to a list with at least one entry
    if text_encoder_lr is None or (isinstance(text_encoder_lr, list) and len(text_encoder_lr) == 0):
        text_encoder_lr = [default_lr]
    elif isinstance(text_encoder_lr, float) or isinstance(text_encoder_lr, int):
        text_encoder_lr = [float(text_encoder_lr)]
    elif len(text_encoder_lr) == 1:
        pass  # already a list with one element

    self.requires_grad_(True)

    all_params = []
    lr_descriptions = []

    def assemble_params(loras, lr, loraplus_ratio):
        # Split parameters into the base "lora" group and the LoRA+ "plus" group
        # (lora_up weights get lr * loraplus_ratio when a ratio is configured).
        param_groups = {"lora": {}, "plus": {}}
        for lora in loras:
            for name, param in lora.named_parameters():
                if loraplus_ratio is not None and "lora_up" in name:
                    param_groups["plus"][f"{lora.lora_name}.{name}"] = param
                else:
                    param_groups["lora"][f"{lora.lora_name}.{name}"] = param

        params = []
        descriptions = []
        for key in param_groups.keys():
            param_data = {"params": param_groups[key].values()}
            if len(param_data["params"]) == 0:
                continue
            if lr is not None:
                if key == "plus":
                    param_data["lr"] = lr * loraplus_ratio
                else:
                    param_data["lr"] = lr
            # a zero or missing LR means this group would not learn — drop it
            if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None:
                logger.info("NO LR skipping!")
                continue
            params.append(param_data)
            descriptions.append("plus" if key == "plus" else "")
        return params, descriptions

    if self.text_encoder_loras:
        loraplus_ratio = self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio
        te1_loras = [
            lora for lora in self.text_encoder_loras
            if lora.lora_name.startswith(self.LORA_PREFIX_TEXT_ENCODER)
        ]
        if len(te1_loras) > 0:
            logger.info(f"Text Encoder 1 (Qwen3): {len(te1_loras)} modules, LR {text_encoder_lr[0]}")
            params, descriptions = assemble_params(te1_loras, text_encoder_lr[0], loraplus_ratio)
            all_params.extend(params)
            lr_descriptions.extend(["textencoder 1" + (" " + d if d else "") for d in descriptions])

    if self.unet_loras:
        params, descriptions = assemble_params(
            self.unet_loras,
            unet_lr if unet_lr is not None else default_lr,
            self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio,
        )
        all_params.extend(params)
        lr_descriptions.extend(["unet" + (" " + d if d else "") for d in descriptions])

    return all_params, lr_descriptions
def enable_gradient_checkpointing(self):
    """No-op: gradient checkpointing of the LoRA modules themselves is not supported."""
    pass  # not supported
def prepare_grad_etc(self, text_encoder, unet):
    """Enable gradients on every LoRA parameter before training starts."""
    self.requires_grad_(True)
def on_epoch_start(self, text_encoder, unet):
    """Put the LoRA network (all registered child modules) into training mode."""
    self.train()
def get_trainable_params(self):
    """Return an iterator over all LoRA parameters (used when no per-group LRs are needed)."""
    return self.parameters()
def save_weights(self, file, dtype, metadata):
    """Serialize this network's LoRA state dict to *file*.

    Tensors are optionally cast to *dtype* on CPU first. A ``.safetensors``
    target additionally gets sshs model hashes embedded in its metadata;
    any other extension is written with ``torch.save``.
    """
    # an empty metadata dict is treated the same as "no metadata"
    if metadata is not None and len(metadata) == 0:
        metadata = None

    state_dict = self.state_dict()

    if dtype is not None:
        # detach/clone so the live training tensors are left untouched
        for key in list(state_dict.keys()):
            state_dict[key] = state_dict[key].detach().clone().to("cpu").to(dtype)

    if os.path.splitext(file)[1] != ".safetensors":
        torch.save(state_dict, file)
        return

    from safetensors.torch import save_file
    from library import train_util

    if metadata is None:
        metadata = {}
    # embed sshs hashes so tooling can identify the checkpoint
    model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata)
    metadata["sshs_model_hash"] = model_hash
    metadata["sshs_legacy_hash"] = legacy_hash

    save_file(state_dict, file, metadata)
def backup_weights(self):
    """Snapshot each wrapped module's original weight so it can be restored later.

    The copy is stored on the module itself as ``_lora_org_weight``; modules
    that already hold a backup are skipped so repeated calls never overwrite
    the original with an already-merged weight.
    """
    loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras
    for lora in loras:
        org_module = lora.org_module_ref[0]
        if not hasattr(org_module, "_lora_org_weight"):
            sd = org_module.state_dict()
            org_module._lora_org_weight = sd["weight"].detach().clone()
            # the module currently holds its original (un-merged) weight
            org_module._lora_restored = True
def restore_weights(self):
    """Write the backed-up original weight back into each wrapped module.

    Modules whose ``_lora_restored`` flag is already True (never merged, or
    already restored) are left alone. Assumes backup_weights() has run, since
    both ``_lora_restored`` and ``_lora_org_weight`` are read unconditionally.
    """
    loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras
    for lora in loras:
        org_module = lora.org_module_ref[0]
        if not org_module._lora_restored:
            sd = org_module.state_dict()
            sd["weight"] = org_module._lora_org_weight
            org_module.load_state_dict(sd)
            org_module._lora_restored = True  # back to the original state
def pre_calculation(self):
    """Merge each LoRA delta into its wrapped module's weight ahead of inference.

    After the in-place merge the LoRA module is disabled so the delta is not
    applied again at forward time; ``_lora_restored`` is cleared so a later
    restore_weights() call will undo the merge.
    """
    loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras
    for lora in loras:
        org_module = lora.org_module_ref[0]
        sd = org_module.state_dict()

        org_weight = sd["weight"]
        lora_weight = lora.get_weight().to(org_weight.device, dtype=org_weight.dtype)
        sd["weight"] = org_weight + lora_weight
        # the merged delta must not change the layer's weight shape
        assert sd["weight"].shape == org_weight.shape
        org_module.load_state_dict(sd)

        org_module._lora_restored = False
        lora.enabled = False
def apply_max_norm_regularization(self, max_norm_value, device):
    """Rescale LoRA pairs whose effective merged norm exceeds *max_norm_value*.

    For every lora_down/lora_up pair in the state dict, the merged delta
    (up @ down, scaled by alpha / dim) is computed on *device*. Pairs above
    the cap are scaled in place; the correction is applied as sqrt(ratio) to
    both up and down so their product is scaled by exactly ratio.

    Args:
        max_norm_value: maximum allowed norm of a merged LoRA delta.
        device: device used for the norm computation.

    Returns:
        Tuple of (number of pairs rescaled, mean post-scaling norm,
        max post-scaling norm). When the network holds no lora_down/lora_up
        pairs, returns (0, 0.0, 0.0) instead of failing on the empty list.
    """
    downkeys = []
    upkeys = []
    alphakeys = []
    norms = []
    keys_scaled = 0

    state_dict = self.state_dict()
    for key in state_dict.keys():
        if "lora_down" in key and "weight" in key:
            downkeys.append(key)
            upkeys.append(key.replace("lora_down", "lora_up"))
            alphakeys.append(key.replace("lora_down.weight", "alpha"))

    for i in range(len(downkeys)):
        down = state_dict[downkeys[i]].to(device)
        up = state_dict[upkeys[i]].to(device)
        alpha = state_dict[alphakeys[i]].to(device)
        dim = down.shape[0]
        scale = alpha / dim

        # merge the pair into the full delta; conv kernels need special handling
        if up.shape[2:] == (1, 1) and down.shape[2:] == (1, 1):
            updown = (up.squeeze(2).squeeze(2) @ down.squeeze(2).squeeze(2)).unsqueeze(2).unsqueeze(3)
        elif up.shape[2:] == (3, 3) or down.shape[2:] == (3, 3):
            updown = torch.nn.functional.conv2d(down.permute(1, 0, 2, 3), up).permute(1, 0, 2, 3)
        else:
            updown = up @ down

        updown *= scale

        # clamp(min=...) bounds the scaling ratio even for very small norms
        norm = updown.norm().clamp(min=max_norm_value / 2)
        desired = torch.clamp(norm, max=max_norm_value)
        ratio = desired.cpu() / norm.cpu()
        sqrt_ratio = ratio**0.5
        if ratio != 1:
            keys_scaled += 1
            # scale the stored tensors in place (they alias the live parameters)
            state_dict[upkeys[i]] *= sqrt_ratio
            state_dict[downkeys[i]] *= sqrt_ratio
        scalednorm = updown.norm() * ratio
        norms.append(scalednorm.item())

    # guard: without any lora_down/up pairs the mean would raise
    # ZeroDivisionError and max() would raise on an empty sequence
    if not norms:
        return keys_scaled, 0.0, 0.0

    return keys_scaled, sum(norms) / len(norms), max(norms)
||||||
617
tests/test_anima_cache.py
Normal file
617
tests/test_anima_cache.py
Normal file
@@ -0,0 +1,617 @@
|
|||||||
|
"""
|
||||||
|
Diagnostic script to test Anima latent & text encoder caching independently.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python test_anima_cache.py \
|
||||||
|
--image_dir /path/to/images \
|
||||||
|
--qwen3_path /path/to/qwen3 \
|
||||||
|
--vae_path /path/to/vae.safetensors \
|
||||||
|
[--t5_tokenizer_path /path/to/t5] \
|
||||||
|
[--cache_to_disk]
|
||||||
|
|
||||||
|
The image_dir should contain pairs of:
|
||||||
|
image1.png + image1.txt
|
||||||
|
image2.jpg + image2.txt
|
||||||
|
...
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
from torchvision import transforms
|
||||||
|
|
||||||
|
# Helpers

# File extensions accepted as dataset images when scanning a directory.
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff"}

# Same preprocessing as sd-scripts training: uint8 HWC image -> float CHW in [-1, 1].
IMAGE_TRANSFORMS = transforms.Compose([
    transforms.ToTensor(), # [0,1]
    transforms.Normalize([0.5], [0.5]), # [-1,1]
])
def find_image_caption_pairs(image_dir: str):
    """Return sorted (image_path, caption) pairs found in *image_dir*.

    The caption is read from the sidecar ``<name>.txt`` file when present,
    otherwise it is the empty string.
    """
    result = []
    for entry in sorted(os.listdir(image_dir)):
        if os.path.splitext(entry)[1].lower() not in IMAGE_EXTENSIONS:
            continue
        img_path = os.path.join(image_dir, entry)
        caption_file = os.path.splitext(img_path)[0] + ".txt"
        caption = ""
        if os.path.exists(caption_file):
            with open(caption_file, "r", encoding="utf-8") as handle:
                caption = handle.read().strip()
        result.append((img_path, caption))
    return result
def print_tensor_info(name: str, t, indent=2):
    """Print a one-line summary (dtype, shape, min/max/mean) of *t*, indented by *indent* spaces."""
    prefix = " " * indent
    if t is None:
        print(f"{prefix}{name}: None")
        return
    if isinstance(t, np.ndarray):
        summary = (f"{prefix}{name}: numpy {t.dtype} shape={t.shape} "
                   f"min={t.min():.4f} max={t.max():.4f} mean={t.mean():.4f}")
    elif isinstance(t, torch.Tensor):
        summary = (f"{prefix}{name}: torch {t.dtype} shape={tuple(t.shape)} "
                   f"min={t.min().item():.4f} max={t.max().item():.4f} mean={t.float().mean().item():.4f}")
    else:
        summary = f"{prefix}{name}: type={type(t)} value={t}"
    print(summary)
|
# Test 1: Latent Cache
|
||||||
|
|
||||||
|
def test_latent_cache(args, pairs):
    """Diagnostic: encode each image with the Anima VAE and verify the latent cache.

    For every (image, caption) pair this loads the image, applies the training
    transforms, VAE-encodes it, checks for NaN/Inf, and (with --cache_to_disk)
    round-trips the latents through an .npz file and compares.
    """
    print("\n" + "=" * 70)
    print("TEST 1: LATENT CACHING (VAE encode -> cache -> reload)")
    print("=" * 70)

    from library import anima_utils
    from library.anima_models import ANIMA_VAE_MEAN, ANIMA_VAE_STD

    # Load VAE
    print("\n[1.1] Loading VAE...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    vae_dtype = torch.float32
    # NOTE(review): vae_mean / vae_std returned here are unused; the module-level
    # ANIMA_VAE_MEAN / ANIMA_VAE_STD constants are printed instead — presumably equal, confirm.
    vae, vae_mean, vae_std, vae_scale = anima_utils.load_anima_vae(
        args.vae_path, dtype=vae_dtype, device=device
    )
    print(f" VAE loaded on {device}, dtype={vae_dtype}")
    print(f" VAE mean (first 4): {ANIMA_VAE_MEAN[:4]}")
    print(f" VAE std (first 4): {ANIMA_VAE_STD[:4]}")

    for img_path, caption in pairs:
        print(f"\n[1.2] Processing: {os.path.basename(img_path)}")

        # Load image
        img = Image.open(img_path).convert("RGB")
        img_np = np.array(img)
        print(f" Raw image: {img_np.shape} dtype={img_np.dtype} "
              f"min={img_np.min()} max={img_np.max()}")

        # Apply IMAGE_TRANSFORMS (same as sd-scripts training)
        img_tensor = IMAGE_TRANSFORMS(img_np)
        print(f" After IMAGE_TRANSFORMS: shape={tuple(img_tensor.shape)} "
              f"min={img_tensor.min():.4f} max={img_tensor.max():.4f}")

        # Check range is [-1, 1]
        if img_tensor.min() < -1.01 or img_tensor.max() > 1.01:
            print(" ** WARNING: tensor out of [-1, 1] range!")
        else:
            print(" OK: tensor in [-1, 1] range")

        # Encode with VAE
        img_batch = img_tensor.unsqueeze(0).to(device, dtype=vae_dtype)  # (1, C, H, W)
        img_5d = img_batch.unsqueeze(2)  # (1, C, 1, H, W) - add temporal dim
        print(f" VAE input: shape={tuple(img_5d.shape)} dtype={img_5d.dtype}")

        with torch.no_grad():
            latents = vae.encode(img_5d, vae_scale)
        latents_cpu = latents.cpu()
        print_tensor_info("Encoded latents", latents_cpu)

        # Check for NaN/Inf
        if torch.any(torch.isnan(latents_cpu)):
            print(" ** ERROR: NaN in latents!")
        elif torch.any(torch.isinf(latents_cpu)):
            print(" ** ERROR: Inf in latents!")
        else:
            print(" OK: no NaN/Inf")

        # Test disk cache round-trip
        if args.cache_to_disk:
            npz_path = os.path.splitext(img_path)[0] + "_test_latent.npz"
            latents_np = latents_cpu.float().numpy()
            h, w = img_np.shape[:2]
            np.savez(
                npz_path,
                latents=latents_np,
                original_size=np.array([w, h]),
                crop_ltrb=np.array([0, 0, 0, 0]),
            )
            print(f" Saved to: {npz_path}")

            # Reload
            loaded = np.load(npz_path)
            loaded_latents = loaded["latents"]
            print_tensor_info("Reloaded latents", loaded_latents)

            # Compare
            diff = np.abs(latents_np - loaded_latents).max()
            print(f" Max diff (save vs load): {diff:.2e}")
            if diff > 1e-5:
                print(" ** WARNING: latent cache round-trip has significant diff!")
            else:
                print(" OK: round-trip matches")

            os.remove(npz_path)
            print(f" Cleaned up {npz_path}")

    # free GPU memory before the next test
    vae.to("cpu")
    del vae
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
    print("\n[1.3] Latent cache test DONE.")
|
|
||||||
|
# Test 2: Text Encoder Output Cache
|
||||||
|
|
||||||
|
def test_text_encoder_cache(args, pairs):
    """Diagnostic: tokenize/encode the captions and verify the text-encoder cache paths.

    Exercises tokenization, Qwen3 encoding, the numpy/.npz disk round-trip used
    by the caching strategy, the in-memory stack used by ``__getitem__``, the
    unpack performed by get_noise_pred_and_target, and caption dropout.
    """
    print("\n" + "=" * 70)
    print("TEST 2: TEXT ENCODER OUTPUT CACHING")
    print("=" * 70)

    from library import anima_utils

    # Load tokenizers
    print("\n[2.1] Loading tokenizers...")
    qwen3_tokenizer = anima_utils.load_qwen3_tokenizer(args.qwen3_path)
    t5_tokenizer = anima_utils.load_t5_tokenizer(
        getattr(args, 't5_tokenizer_path', None)
    )
    print(f" Qwen3 tokenizer vocab: {qwen3_tokenizer.vocab_size}")
    print(f" T5 tokenizer vocab: {t5_tokenizer.vocab_size}")

    # Load text encoder
    print("\n[2.2] Loading Qwen3 text encoder...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    te_dtype = torch.bfloat16 if device == "cuda" else torch.float32
    qwen3_model, _ = anima_utils.load_qwen3_text_encoder(
        args.qwen3_path, dtype=te_dtype, device=device
    )
    qwen3_model.eval()

    # Create strategy objects
    from library.strategy_anima import AnimaTokenizeStrategy, AnimaTextEncodingStrategy

    tokenize_strategy = AnimaTokenizeStrategy(
        qwen3_tokenizer=qwen3_tokenizer,
        t5_tokenizer=t5_tokenizer,
        qwen3_max_length=args.qwen3_max_length,
        t5_max_length=args.t5_max_length,
    )
    text_encoding_strategy = AnimaTextEncodingStrategy(
        dropout_rate=0.0,
    )

    captions = [cap for _, cap in pairs]
    print(f"\n[2.3] Tokenizing {len(captions)} captions...")
    for i, cap in enumerate(captions):
        print(f" [{i}] \"{cap[:80]}{'...' if len(cap) > 80 else ''}\"")

    tokens_and_masks = tokenize_strategy.tokenize(captions)
    qwen3_input_ids, qwen3_attn_mask, t5_input_ids, t5_attn_mask = tokens_and_masks

    print(f"\n Tokenization results:")
    print_tensor_info("qwen3_input_ids", qwen3_input_ids)
    print_tensor_info("qwen3_attn_mask", qwen3_attn_mask)
    print_tensor_info("t5_input_ids", t5_input_ids)
    print_tensor_info("t5_attn_mask", t5_attn_mask)

    # Encode
    print(f"\n[2.4] Encoding with Qwen3 text encoder...")
    with torch.no_grad():
        prompt_embeds, attn_mask, t5_ids_out, t5_mask_out = text_encoding_strategy.encode_tokens(
            tokenize_strategy,
            [qwen3_model],
            tokens_and_masks,
            enable_dropout=False,
        )

    print(f" Encoding results:")
    print_tensor_info("prompt_embeds", prompt_embeds)
    print_tensor_info("attn_mask", attn_mask)
    print_tensor_info("t5_input_ids", t5_ids_out)
    print_tensor_info("t5_attn_mask", t5_mask_out)

    # Check for NaN/Inf
    if torch.any(torch.isnan(prompt_embeds)):
        print(" ** ERROR: NaN in prompt_embeds!")
    elif torch.any(torch.isinf(prompt_embeds)):
        print(" ** ERROR: Inf in prompt_embeds!")
    else:
        print(" OK: no NaN/Inf in prompt_embeds")

    # Test cache round-trip (simulate what AnimaTextEncoderOutputsCachingStrategy does)
    print(f"\n[2.5] Testing cache round-trip (encode -> numpy -> npz -> reload -> tensor)...")

    # Convert to numpy (same as cache_batch_outputs in strategy_anima.py)
    pe_cpu = prompt_embeds.cpu()
    if pe_cpu.dtype == torch.bfloat16:
        # numpy has no bfloat16; widen before the conversion
        pe_cpu = pe_cpu.float()
    pe_np = pe_cpu.numpy()
    am_np = attn_mask.cpu().numpy()
    t5_ids_np = t5_ids_out.cpu().numpy().astype(np.int32)
    t5_mask_np = t5_mask_out.cpu().numpy().astype(np.int32)

    print(f" Numpy conversions:")
    print_tensor_info("prompt_embeds_np", pe_np)
    print_tensor_info("attn_mask_np", am_np)
    print_tensor_info("t5_input_ids_np", t5_ids_np)
    print_tensor_info("t5_attn_mask_np", t5_mask_np)

    if args.cache_to_disk:
        # NOTE(review): npz_path is never used — per-sample paths below are used instead
        npz_path = os.path.join(args.image_dir, "_test_te_cache.npz")
        # Save per-sample (simulating cache_batch_outputs)
        for i in range(len(captions)):
            sample_npz = os.path.splitext(pairs[i][0])[0] + "_test_te.npz"
            np.savez(
                sample_npz,
                prompt_embeds=pe_np[i],
                attn_mask=am_np[i],
                t5_input_ids=t5_ids_np[i],
                t5_attn_mask=t5_mask_np[i],
            )
            print(f" Saved: {sample_npz}")

            # Reload (simulating load_outputs_npz)
            data = np.load(sample_npz)
            print(f" Reloaded keys: {list(data.keys())}")
            print_tensor_info(" loaded prompt_embeds", data["prompt_embeds"], indent=4)
            print_tensor_info(" loaded attn_mask", data["attn_mask"], indent=4)
            print_tensor_info(" loaded t5_input_ids", data["t5_input_ids"], indent=4)
            print_tensor_info(" loaded t5_attn_mask", data["t5_attn_mask"], indent=4)

            # Check diff
            diff_pe = np.abs(pe_np[i] - data["prompt_embeds"]).max()
            diff_t5 = np.abs(t5_ids_np[i] - data["t5_input_ids"]).max()
            print(f" Max diff prompt_embeds: {diff_pe:.2e}")
            print(f" Max diff t5_input_ids: {diff_t5:.2e}")
            if diff_pe > 1e-5 or diff_t5 > 0:
                print(" ** WARNING: cache round-trip mismatch!")
            else:
                print(" OK: round-trip matches")

            os.remove(sample_npz)
            print(f" Cleaned up {sample_npz}")

    # Test in-memory cache round-trip (simulating what __getitem__ does)
    print(f"\n[2.6] Testing in-memory cache simulation (tuple -> none_or_stack_elements -> batch)...")

    # Simulate per-sample storage (like info.text_encoder_outputs = tuple)
    per_sample_cached = []
    for i in range(len(captions)):
        per_sample_cached.append((pe_np[i], am_np[i], t5_ids_np[i], t5_mask_np[i]))

    # Simulate none_or_stack_elements with torch.FloatTensor converter
    # This is what train_util.py __getitem__ does at line 1784
    stacked = []
    for elem_idx in range(4):
        arrays = [sample[elem_idx] for sample in per_sample_cached]
        stacked.append(torch.stack([torch.FloatTensor(a) for a in arrays]))

    print(f" Stacked batch (like batch['text_encoder_outputs_list']):")
    names = ["prompt_embeds", "attn_mask", "t5_input_ids", "t5_attn_mask"]
    for name, tensor in zip(names, stacked):
        print_tensor_info(name, tensor)

    # Check condition: len(text_encoder_conds) == 0 or text_encoder_conds[0] is None
    text_encoder_conds = stacked
    cond_check_1 = len(text_encoder_conds) == 0
    cond_check_2 = text_encoder_conds[0] is None
    print(f"\n Condition check (should both be False when caching works):")
    print(f" len(text_encoder_conds) == 0 : {cond_check_1}")
    print(f" text_encoder_conds[0] is None: {cond_check_2}")
    if not cond_check_1 and not cond_check_2:
        print(" OK: cached text encoder outputs would be used")
    else:
        print(" ** BUG: code would try to re-encode (and crash on None input_ids_list)!")

    # Test unpack for get_noise_pred_and_target (line 311)
    print(f"\n[2.7] Testing unpack: prompt_embeds, attn_mask, t5_input_ids, t5_attn_mask = text_encoder_conds")
    try:
        pe_batch, am_batch, t5_ids_batch, t5_mask_batch = text_encoder_conds
        print(f" Unpack OK")
        print_tensor_info("prompt_embeds", pe_batch)
        print_tensor_info("attn_mask", am_batch)
        print_tensor_info("t5_input_ids", t5_ids_batch)
        print_tensor_info("t5_attn_mask", t5_mask_batch)

        # Check t5_input_ids are integers (they were converted to FloatTensor!)
        if t5_ids_batch.dtype != torch.long and t5_ids_batch.dtype != torch.int32:
            print(f"\n ** NOTE: t5_input_ids dtype is {t5_ids_batch.dtype}, will be cast to long at line 316")
            t5_ids_long = t5_ids_batch.to(dtype=torch.long)
            # Check if any precision was lost
            diff = (t5_ids_batch - t5_ids_long.float()).abs().max()
            print(f" Float->Long precision loss: {diff:.2e}")
            if diff > 0.5:
                print(" ** ERROR: token IDs corrupted by float conversion!")
            else:
                print(" OK: float->long conversion is lossless for these IDs")
    except Exception as e:
        print(f" ** ERROR unpacking: {e}")
        traceback.print_exc()

    # Test drop_cached_text_encoder_outputs
    print(f"\n[2.8] Testing drop_cached_text_encoder_outputs (caption dropout)...")
    dropout_strategy = AnimaTextEncodingStrategy(
        dropout_rate=0.5,  # high rate to ensure some drops
    )
    dropped = dropout_strategy.drop_cached_text_encoder_outputs(*stacked)
    print(f" Returned {len(dropped)} tensors")
    for name, tensor in zip(names, dropped):
        print_tensor_info(f"dropped_{name}", tensor)

    # Check which items were dropped
    for i in range(len(captions)):
        is_zero = (dropped[0][i].abs().sum() == 0).item()
        print(f" Sample {i}: {'DROPPED' if is_zero else 'KEPT'}")

    qwen3_model.to("cpu")
    del qwen3_model
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
    # NOTE(review): step label duplicates the "[2.8]" above; likely meant "[2.9]"
    print("\n[2.8] Text encoder cache test DONE.")
|
|
||||||
|
|
||||||
|
# Test 3: Full batch simulation
|
||||||
|
|
||||||
|
def test_full_batch_simulation(args, pairs):
    """Diagnostic: replay the whole caching -> batching -> process_batch pipeline.

    Loads all models, caches text-encoder outputs and latents the same way the
    trainer does, builds a batch dict, and then checks the critical
    ``is_train_text_encoder`` condition that decides between using cached
    conditions and re-encoding (the latter crashes when input_ids_list is None).
    """
    print("\n" + "=" * 70)
    print("TEST 3: FULL BATCH SIMULATION (mimics process_batch flow)")
    print("=" * 70)

    from library import anima_utils
    from library.anima_models import ANIMA_VAE_MEAN, ANIMA_VAE_STD
    from library.strategy_anima import AnimaTokenizeStrategy, AnimaTextEncodingStrategy

    device = "cuda" if torch.cuda.is_available() else "cpu"
    te_dtype = torch.bfloat16 if device == "cuda" else torch.float32
    vae_dtype = torch.float32

    # Load all models
    print("\n[3.1] Loading models...")
    qwen3_tokenizer = anima_utils.load_qwen3_tokenizer(args.qwen3_path)
    t5_tokenizer = anima_utils.load_t5_tokenizer(getattr(args, 't5_tokenizer_path', None))
    qwen3_model, _ = anima_utils.load_qwen3_text_encoder(args.qwen3_path, dtype=te_dtype, device=device)
    qwen3_model.eval()
    vae, _, _, vae_scale = anima_utils.load_anima_vae(args.vae_path, dtype=vae_dtype, device=device)

    tokenize_strategy = AnimaTokenizeStrategy(
        qwen3_tokenizer=qwen3_tokenizer, t5_tokenizer=t5_tokenizer,
        qwen3_max_length=args.qwen3_max_length, t5_max_length=args.t5_max_length,
    )
    text_encoding_strategy = AnimaTextEncodingStrategy(dropout_rate=0.0)

    captions = [cap for _, cap in pairs]

    # --- Simulate caching phase ---
    print("\n[3.2] Simulating text encoder caching phase...")
    tokens_and_masks = tokenize_strategy.tokenize(captions)
    with torch.no_grad():
        te_outputs = text_encoding_strategy.encode_tokens(
            tokenize_strategy, [qwen3_model], tokens_and_masks, enable_dropout=False,
        )
    prompt_embeds, attn_mask, t5_input_ids, t5_attn_mask = te_outputs

    # Convert to numpy (same as cache_batch_outputs)
    pe_np = prompt_embeds.cpu().float().numpy()
    am_np = attn_mask.cpu().numpy()
    t5_ids_np = t5_input_ids.cpu().numpy().astype(np.int32)
    t5_mask_np = t5_attn_mask.cpu().numpy().astype(np.int32)

    # Per-sample storage (like info.text_encoder_outputs)
    per_sample_te = [(pe_np[i], am_np[i], t5_ids_np[i], t5_mask_np[i]) for i in range(len(captions))]

    print(f"\n[3.3] Simulating latent caching phase...")
    per_sample_latents = []
    for img_path, _ in pairs:
        img = Image.open(img_path).convert("RGB")
        img_np = np.array(img)
        img_tensor = IMAGE_TRANSFORMS(img_np).unsqueeze(0).unsqueeze(2)  # (1,C,1,H,W)
        img_tensor = img_tensor.to(device, dtype=vae_dtype)
        with torch.no_grad():
            lat = vae.encode(img_tensor, vae_scale).cpu()
        per_sample_latents.append(lat.squeeze(0))  # (C,1,H,W)
        print(f" {os.path.basename(img_path)}: latent shape={tuple(lat.shape)}")

    # --- Simulate batch construction (__getitem__) ---
    print(f"\n[3.4] Simulating batch construction...")

    # Use first image's latents only (images may have different resolutions)
    latents_batch = per_sample_latents[0].unsqueeze(0)  # (1,C,1,H,W)
    print(f" Using first image latent for simulation: shape={tuple(latents_batch.shape)}")

    # Stack text encoder outputs (none_or_stack_elements)
    text_encoder_outputs_list = []
    for elem_idx in range(4):
        arrays = [s[elem_idx] for s in per_sample_te]
        text_encoder_outputs_list.append(torch.stack([torch.FloatTensor(a) for a in arrays]))

    # input_ids_list is None when caching
    input_ids_list = None

    batch = {
        "latents": latents_batch,
        "text_encoder_outputs_list": text_encoder_outputs_list,
        "input_ids_list": input_ids_list,
        "loss_weights": torch.ones(len(captions)),
    }

    print(f" batch keys: {list(batch.keys())}")
    print(f" batch['latents']: shape={tuple(batch['latents'].shape)}")
    print(f" batch['text_encoder_outputs_list']: {len(batch['text_encoder_outputs_list'])} tensors")
    print(f" batch['input_ids_list']: {batch['input_ids_list']}")

    # --- Simulate process_batch logic ---
    print(f"\n[3.5] Simulating process_batch logic...")

    text_encoder_conds = []
    te_out = batch.get("text_encoder_outputs_list", None)
    if te_out is not None:
        text_encoder_conds = te_out
        print(f" text_encoder_conds loaded from cache: {len(text_encoder_conds)} tensors")
    else:
        print(f" text_encoder_conds: empty (no cache)")

    # The critical condition
    train_text_encoder_TRUE = True  # OLD behavior (base class default, no override)
    train_text_encoder_FALSE = False  # NEW behavior (with is_train_text_encoder override)

    cond_old = len(text_encoder_conds) == 0 or text_encoder_conds[0] is None or train_text_encoder_TRUE
    cond_new = len(text_encoder_conds) == 0 or text_encoder_conds[0] is None or train_text_encoder_FALSE

    print(f"\n === CRITICAL CONDITION CHECK ===")
    print(f" len(text_encoder_conds) == 0 : {len(text_encoder_conds) == 0}")
    print(f" text_encoder_conds[0] is None: {text_encoder_conds[0] is None}")
    print(f" train_text_encoder (OLD=True) : {train_text_encoder_TRUE}")
    print(f" train_text_encoder (NEW=False): {train_text_encoder_FALSE}")
    print(f"")
    print(f" Condition with OLD behavior (no override): {cond_old}")
    msg = (
        "ENTERS re-encode block -> accesses batch['input_ids_list'] -> CRASH!"
        if cond_old
        else "SKIPS re-encode block -> uses cache -> OK"
    )

    print(f" -> {msg}")
    print(f" Condition with NEW behavior (override): {cond_new}")
    print(f" -> {'ENTERS re-encode block' if cond_new else 'SKIPS re-encode block -> uses cache -> OK'}")

    if cond_old and not cond_new:
        print(f"\n ** CONFIRMED: the is_train_text_encoder override fixes the crash **")

    # Simulate the rest of process_batch
    print(f"\n[3.6] Simulating get_noise_pred_and_target unpack...")
    try:
        pe, am, t5_ids, t5_mask = text_encoder_conds
        pe = pe.to(device, dtype=te_dtype)
        am = am.to(device)
        t5_ids = t5_ids.to(device, dtype=torch.long)
        t5_mask = t5_mask.to(device)

        print(f" Unpack + device transfer OK:")
        print_tensor_info("prompt_embeds", pe)
        print_tensor_info("attn_mask", am)
        print_tensor_info("t5_input_ids", t5_ids)
        print_tensor_info("t5_attn_mask", t5_mask)

        # Verify t5_input_ids didn't get corrupted by float conversion
        t5_ids_orig = torch.tensor(t5_ids_np, dtype=torch.long, device=device)
        id_match = torch.all(t5_ids == t5_ids_orig).item()
        print(f"\n t5_input_ids integrity (float->long roundtrip): {'OK' if id_match else '** MISMATCH **'}")
        if not id_match:
            diff_count = (t5_ids != t5_ids_orig).sum().item()
            print(f" {diff_count} token IDs differ!")
            # Show example
            idx = torch.where(t5_ids != t5_ids_orig)
            if len(idx[0]) > 0:
                i, j = idx[0][0].item(), idx[1][0].item()
                print(f" Example: position [{i},{j}] original={t5_ids_orig[i,j].item()} loaded={t5_ids[i,j].item()}")

    except Exception as e:
        print(f" ** ERROR: {e}")
        traceback.print_exc()

    # Cleanup
    vae.to("cpu")
    qwen3_model.to("cpu")
    del vae, qwen3_model
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
    print("\n[3.7] Full batch simulation DONE.")
|
|
||||||
|
|
||||||
|
# Main
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: run the selected Anima cache tests and print a summary."""
    parser = argparse.ArgumentParser(description="Test Anima caching mechanisms")
    parser.add_argument("--image_dir", type=str, required=True,
                        help="Directory with image+txt pairs")
    parser.add_argument("--qwen3_path", type=str, required=True,
                        help="Path to Qwen3 model (directory or safetensors)")
    parser.add_argument("--vae_path", type=str, required=True,
                        help="Path to WanVAE safetensors")
    parser.add_argument("--t5_tokenizer_path", type=str, default=None,
                        help="Path to T5 tokenizer (optional, uses bundled config)")
    parser.add_argument("--qwen3_max_length", type=int, default=512)
    parser.add_argument("--t5_max_length", type=int, default=512)
    parser.add_argument("--cache_to_disk", action="store_true",
                        help="Also test disk cache round-trip")
    parser.add_argument("--skip_latent", action="store_true",
                        help="Skip latent cache test")
    parser.add_argument("--skip_text", action="store_true",
                        help="Skip text encoder cache test")
    parser.add_argument("--skip_full", action="store_true",
                        help="Skip full batch simulation")
    args = parser.parse_args()

    # Locate the image/caption pairs; bail out early when there are none.
    pairs = find_image_caption_pairs(args.image_dir)
    if not pairs:
        print(f"ERROR: No image+txt pairs found in {args.image_dir}")
        print("Expected: image.png + image.txt, image.jpg + image.txt, etc.")
        sys.exit(1)

    print(f"Found {len(pairs)} image-caption pairs:")
    for img_path, cap in pairs:
        print(f"  {os.path.basename(img_path)}: \"{cap[:60]}{'...' if len(cap) > 60 else ''}\"")

    # (skip flag, results key, failure label, test callable) — one row per suite.
    suites = [
        (args.skip_latent, "latent_cache", "LATENT CACHE TEST", test_latent_cache),
        (args.skip_text, "text_encoder_cache", "TEXT ENCODER CACHE TEST", test_text_encoder_cache),
        (args.skip_full, "full_batch_sim", "FULL BATCH SIMULATION", test_full_batch_simulation),
    ]

    results = {}
    for skip, key, label, runner in suites:
        if skip:
            continue
        try:
            runner(args, pairs)
            results[key] = "PASS"
        except Exception as e:
            print(f"\n** {label} FAILED: {e}")
            traceback.print_exc()
            results[key] = f"FAIL: {e}"

    # Summary
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    for test, result in results.items():
        status = "OK" if result == "PASS" else "FAIL"
        print(f"  [{status}] {test}: {result}")
    print()
||||||
|
|
||||||
|
# Allow running this cache-test script directly from the command line.
if __name__ == "__main__":
    main()
|
||||||
242
tests/test_anima_real_training.py
Normal file
242
tests/test_anima_real_training.py
Normal file
@@ -0,0 +1,242 @@
|
|||||||
|
"""
|
||||||
|
Test script that actually runs anima_train.py and anima_train_network.py
|
||||||
|
for a few steps to verify --cache_text_encoder_outputs works.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python test_anima_real_training.py \
|
||||||
|
--image_dir /path/to/images_with_txt \
|
||||||
|
--dit_path /path/to/dit.safetensors \
|
||||||
|
--qwen3_path /path/to/qwen3 \
|
||||||
|
--vae_path /path/to/vae.safetensors \
|
||||||
|
[--t5_tokenizer_path /path/to/t5] \
|
||||||
|
[--resolution 512]
|
||||||
|
|
||||||
|
This will run 4 tests:
|
||||||
|
1. anima_train.py (full finetune, no cache)
|
||||||
|
2. anima_train.py (full finetune, --cache_text_encoder_outputs)
|
||||||
|
3. anima_train_network.py (LoRA, no cache)
|
||||||
|
4. anima_train_network.py (LoRA, --cache_text_encoder_outputs)
|
||||||
|
|
||||||
|
Each test runs only 2 training steps then stops.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
|
||||||
|
def create_dataset_toml(image_dir: str, resolution: int, toml_path: str) -> str:
    """Create a minimal dataset toml config.

    Args:
        image_dir: directory containing the image+caption pairs. Backslashes
            (Windows path separators) are normalized to forward slashes so the
            embedded value is a valid TOML basic string.
        resolution: training resolution written to the [general] section.
        toml_path: destination file path for the generated config.

    Returns:
        toml_path, for call-site convenience.
    """
    # A raw backslash is an invalid escape sequence inside a TOML basic
    # string, so a Windows path written verbatim would make the config
    # unparseable. Forward slashes are accepted by Python path handling
    # on all platforms, so normalize instead of escaping.
    safe_image_dir = image_dir.replace("\\", "/")
    content = f"""[general]
resolution = {resolution}
enable_bucket = true
bucket_reso_steps = 8
min_bucket_reso = 256
max_bucket_reso = 1024

[[datasets]]
batch_size = 1

[[datasets.subsets]]
image_dir = "{safe_image_dir}"
num_repeats = 1
caption_extension = ".txt"
"""
    with open(toml_path, "w", encoding="utf-8") as f:
        f.write(content)
    return toml_path
|
||||||
|
|
||||||
|
|
||||||
|
def run_test(test_name: str, cmd: list, timeout: int = 300) -> dict:
    """Run a training command and capture result.

    Executes *cmd* as a subprocess (cwd = this script's directory), echoes the
    tail of its output, and classifies the outcome into a dict with "status"
    (PASS / FAIL / TIMEOUT / ERROR) and a human-readable "detail".
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print(f"TEST: {test_name}")
    print(f"{banner}")
    print(f"Command: {' '.join(cmd)}\n")

    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=os.path.dirname(os.path.abspath(__file__)),
        )

        rc = proc.returncode
        combined = proc.stdout + "\n" + proc.stderr
        out_lines = combined.strip().split("\n")

        # Echo only the tail so long training logs stay readable.
        print(f"--- Last 30 lines of output ---")
        for ln in out_lines[-30:]:
            print(f"  {ln}")
        print(f"--- End output ---\n")

        if rc == 0:
            print(f"RESULT: PASS (exit code 0)")
            return {"status": "PASS", "detail": "completed successfully"}

        # Known failure signature of the cache_text_encoder_outputs bug.
        if "TypeError: 'NoneType' object is not iterable" in combined:
            print(f"RESULT: FAIL - input_ids_list is None (the cache_text_encoder_outputs bug)")
            return {"status": "FAIL", "detail": "input_ids_list is None - cache TE outputs bug"}

        if "steps: 0%" in combined and "Error" in combined:
            # Surface the last error-looking line as the failure detail.
            suspects = [ln for ln in out_lines if "Error" in ln or "Traceback" in ln or "raise" in ln.lower()]
            detail = suspects[-1] if suspects else f"exit code {rc}"
            print(f"RESULT: FAIL - {detail}")
            return {"status": "FAIL", "detail": detail}

        print(f"RESULT: FAIL (exit code {rc})")
        return {"status": "FAIL", "detail": f"exit code {rc}"}

    except subprocess.TimeoutExpired:
        print(f"RESULT: TIMEOUT (>{timeout}s)")
        return {"status": "TIMEOUT", "detail": f"exceeded {timeout}s"}
    except Exception as e:
        print(f"RESULT: ERROR - {e}")
        return {"status": "ERROR", "detail": str(e)}
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: run up to 4 short real-training smoke tests.

    Tests anima_train.py (full finetune) and anima_train_network.py (LoRA),
    each with and without --cache_text_encoder_outputs, for 2 steps each.
    Exits non-zero if any test fails.
    """
    parser = argparse.ArgumentParser(description="Test Anima real training with cache flags")
    parser.add_argument("--image_dir", type=str, required=True,
                        help="Directory with image+txt pairs")
    parser.add_argument("--dit_path", type=str, required=True,
                        help="Path to Anima DiT safetensors")
    parser.add_argument("--qwen3_path", type=str, required=True,
                        help="Path to Qwen3 model")
    parser.add_argument("--vae_path", type=str, required=True,
                        help="Path to WanVAE safetensors")
    parser.add_argument("--t5_tokenizer_path", type=str, default=None)
    parser.add_argument("--resolution", type=int, default=512)
    parser.add_argument("--timeout", type=int, default=300,
                        help="Timeout per test in seconds (default: 300)")
    parser.add_argument("--only", type=str, default=None,
                        choices=["finetune", "lora"],
                        help="Only run finetune or lora tests")
    args = parser.parse_args()

    # Validate paths before spending time launching subprocesses.
    for name, path in [("image_dir", args.image_dir), ("dit_path", args.dit_path),
                       ("qwen3_path", args.qwen3_path), ("vae_path", args.vae_path)]:
        if not os.path.exists(path):
            print(f"ERROR: {name} does not exist: {path}")
            sys.exit(1)

    # Create temp dir for outputs (dataset config, checkpoints, logs).
    tmp_dir = tempfile.mkdtemp(prefix="anima_test_")
    print(f"Temp directory: {tmp_dir}")

    # Create dataset toml
    toml_path = os.path.join(tmp_dir, "dataset.toml")
    create_dataset_toml(args.image_dir, args.resolution, toml_path)
    print(f"Dataset config: {toml_path}")

    output_dir = os.path.join(tmp_dir, "output")
    os.makedirs(output_dir, exist_ok=True)

    # Use the same interpreter that is running this test script.
    python = sys.executable

    # Common args for both scripts
    common_anima_args = [
        "--dit_path", args.dit_path,
        "--qwen3_path", args.qwen3_path,
        "--vae_path", args.vae_path,
        "--pretrained_model_name_or_path", args.dit_path,  # required by base parser
        "--output_dir", output_dir,
        "--output_name", "test",
        "--dataset_config", toml_path,
        "--max_train_steps", "2",
        "--learning_rate", "1e-5",
        "--mixed_precision", "bf16",
        "--save_every_n_steps", "999",  # don't save
        "--max_data_loader_n_workers", "0",  # single process for clarity
        "--logging_dir", os.path.join(tmp_dir, "logs"),
        "--cache_latents",
    ]
    if args.t5_tokenizer_path:
        common_anima_args += ["--t5_tokenizer_path", args.t5_tokenizer_path]

    results = {}

    # TEST 1: anima_train.py - NO cache_text_encoder_outputs
    if args.only is None or args.only == "finetune":
        cmd = [python, "anima_train.py"] + common_anima_args + [
            "--optimizer_type", "AdamW8bit",
        ]
        results["finetune_no_cache"] = run_test(
            "anima_train.py (full finetune, NO text encoder cache)",
            cmd, args.timeout,
        )

        # TEST 2: anima_train.py - WITH cache_text_encoder_outputs
        cmd = [python, "anima_train.py"] + common_anima_args + [
            "--optimizer_type", "AdamW8bit",
            "--cache_text_encoder_outputs",
        ]
        results["finetune_with_cache"] = run_test(
            "anima_train.py (full finetune, WITH --cache_text_encoder_outputs)",
            cmd, args.timeout,
        )

    # TEST 3: anima_train_network.py - NO cache_text_encoder_outputs
    if args.only is None or args.only == "lora":
        lora_args = common_anima_args + [
            "--optimizer_type", "AdamW8bit",
            "--network_module", "networks.lora_anima",
            "--network_dim", "4",
            "--network_alpha", "1",
        ]

        cmd = [python, "anima_train_network.py"] + lora_args
        results["lora_no_cache"] = run_test(
            "anima_train_network.py (LoRA, NO text encoder cache)",
            cmd, args.timeout,
        )

        # TEST 4: anima_train_network.py - WITH cache_text_encoder_outputs
        cmd = [python, "anima_train_network.py"] + lora_args + [
            "--cache_text_encoder_outputs",
        ]
        results["lora_with_cache"] = run_test(
            "anima_train_network.py (LoRA, WITH --cache_text_encoder_outputs)",
            cmd, args.timeout,
        )

    # SUMMARY
    print(f"\n{'=' * 70}")
    print("SUMMARY")
    print(f"{'=' * 70}")
    all_pass = True
    for test_name, result in results.items():
        status = result["status"]
        icon = "OK" if status == "PASS" else "FAIL"
        if status != "PASS":
            all_pass = False
        print(f"  [{icon:4s}] {test_name}: {result['detail']}")

    print(f"\nTemp directory (can delete): {tmp_dir}")

    # Cleanup is best-effort: a held file handle (e.g. on Windows) should not
    # turn a passing run into a failure.
    try:
        shutil.rmtree(tmp_dir)
        print("Temp directory cleaned up.")
    except Exception:
        print(f"Note: could not clean up {tmp_dir}")

    if all_pass:
        print("\nAll tests PASSED!")
    else:
        print("\nSome tests FAILED!")
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this real-training test script directly from the command line.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user