From e64dc05c2a704a3400e6f969c0b6ff9914d226dd Mon Sep 17 00:00:00 2001 From: laolongboy <675077044@qq.com> Date: Mon, 24 Mar 2025 23:33:25 +0800 Subject: [PATCH 1/5] Supplement the input parameters to correctly convert the flux model to BFL format; fixes #1996 --- tools/convert_diffusers_to_flux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/convert_diffusers_to_flux.py b/tools/convert_diffusers_to_flux.py index 65ba7321..fdfc4592 100644 --- a/tools/convert_diffusers_to_flux.py +++ b/tools/convert_diffusers_to_flux.py @@ -56,7 +56,7 @@ def convert(args): save_dtype = str_to_dtype(args.save_precision) if args.save_precision is not None else None # make reverse map from diffusers map - diffusers_to_bfl_map = flux_utils.make_diffusers_to_bfl_map() + diffusers_to_bfl_map = flux_utils.make_diffusers_to_bfl_map(19, 38) # iterate over three safetensors files to reduce memory usage flux_sd = {} From 6a826d21b1dfc631a02e517198fac83f793b2f90 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 28 Sep 2025 18:06:17 +0900 Subject: [PATCH 2/5] feat: add new parameters for sample image inference configuration --- hunyuan_image_train_network.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/hunyuan_image_train_network.py b/hunyuan_image_train_network.py index a67e931d..9ab351ea 100644 --- a/hunyuan_image_train_network.py +++ b/hunyuan_image_train_network.py @@ -249,7 +249,15 @@ def sample_image_inference( arg_c_null = None gen_args = SimpleNamespace( - image_size=(height, width), infer_steps=sample_steps, flow_shift=flow_shift, guidance_scale=cfg_scale, fp8=args.fp8_scaled + image_size=(height, width), + infer_steps=sample_steps, + flow_shift=flow_shift, + guidance_scale=cfg_scale, + fp8=args.fp8_scaled, + apg_start_step_ocr=38, + apg_start_step_general=5, + guidance_rescale=0.0, + guidance_rescale_apg=0.0, ) from hunyuan_image_minimal_inference import generate_body # import here to avoid circular import From a0c26a0efac8c56905153bee8870bcfbb6f96731 Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 28 Sep 2025 18:21:25 +0900 Subject: [PATCH 3/5] docs: enhance text encoder CPU usage instructions for HunyuanImage-2.1 training --- docs/hunyuan_image_train_network.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/hunyuan_image_train_network.md b/docs/hunyuan_image_train_network.md index b2bf113d..b0e9cdd9 100644 --- a/docs/hunyuan_image_train_network.md +++ b/docs/hunyuan_image_train_network.md @@ -190,7 +190,7 @@ The script adds HunyuanImage-2.1 specific arguments. For common arguments (like * `--fp8_vl` - Use FP8 for the VLM (Qwen2.5-VL) text encoder. * `--text_encoder_cpu` - - Runs the text encoders on CPU to reduce VRAM usage. This is useful when VRAM is insufficient (less than 12GB). Encoding one text may take a few minutes (depending on CPU). It is highly recommended to use this option with `--cache_text_encoder_outputs_to_disk` to avoid repeated encoding every time training starts. + - Runs the text encoders on CPU to reduce VRAM usage. This is useful when VRAM is insufficient (less than 12GB). Encoding one text may take a few minutes (depending on CPU). It is highly recommended to use this option with `--cache_text_encoder_outputs_to_disk` to avoid repeated encoding every time training starts. **In addition, increasing `--num_cpu_threads_per_process` in the `accelerate launch` command, like `--num_cpu_threads_per_process=8` or `16`, can speed up encoding in some environments.** * `--blocks_to_swap=` **[Experimental Feature]** - Setting to reduce VRAM usage by swapping parts of the model (Transformer blocks) between CPU and GPU. Specify the number of blocks to swap as an integer (e.g., `18`). Larger values reduce VRAM usage but decrease training speed. Adjust according to your GPU's VRAM capacity. Can be used with `gradient_checkpointing`. * `--cache_text_encoder_outputs` From 60bfa97b190b6993c80787f6a02b6dd884186a09 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Mon, 29 Sep 2025 20:52:48 +0900 Subject: [PATCH 4/5] fix: disable_mmap_safetensors not defined in SDXL TI training --- library/sdxl_train_util.py | 25 ++++++++++++++----------- sdxl_train_textual_inversion.py | 8 ++++---- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/library/sdxl_train_util.py b/library/sdxl_train_util.py index f78d9424..5ac9eb3b 100644 --- a/library/sdxl_train_util.py +++ b/library/sdxl_train_util.py @@ -327,15 +327,18 @@ def save_sd_model_on_epoch_end_or_stepwise( ) -def add_sdxl_training_arguments(parser: argparse.ArgumentParser): - parser.add_argument( - "--cache_text_encoder_outputs", action="store_true", help="cache text encoder outputs / text encoderの出力をキャッシュする" - ) - parser.add_argument( - "--cache_text_encoder_outputs_to_disk", - action="store_true", - help="cache text encoder outputs to disk / text encoderの出力をディスクにキャッシュする", - ) +def add_sdxl_training_arguments(parser: argparse.ArgumentParser, support_text_encoder_caching: bool = True): + if support_text_encoder_caching: + parser.add_argument( + "--cache_text_encoder_outputs", + action="store_true", + help="cache text encoder outputs / text encoderの出力をキャッシュする", + ) + parser.add_argument( + "--cache_text_encoder_outputs_to_disk", + action="store_true", + help="cache text encoder outputs to disk / text encoderの出力をディスクにキャッシュする", + ) parser.add_argument( "--disable_mmap_load_safetensors", action="store_true", @@ -343,7 +346,7 @@ def add_sdxl_training_arguments(parser: argparse.ArgumentParser): ) -def verify_sdxl_training_args(args: argparse.Namespace, supportTextEncoderCaching: bool = True): +def verify_sdxl_training_args(args: argparse.Namespace, support_text_encoder_caching: bool = True): assert not args.v2, "v2 cannot be enabled in SDXL training / SDXL学習ではv2を有効にすることはできません" if args.clip_skip is not None: @@ -366,7 +369,7 @@ def verify_sdxl_training_args(args: argparse.Namespace, supportTextEncoderCachin not hasattr(args, "weighted_captions") or not args.weighted_captions ), "weighted_captions cannot be enabled in SDXL training currently / SDXL学習では今のところweighted_captionsを有効にすることはできません" - if supportTextEncoderCaching: + if support_text_encoder_caching: if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: args.cache_text_encoder_outputs = True logger.warning( diff --git a/sdxl_train_textual_inversion.py b/sdxl_train_textual_inversion.py index 5df739e2..d8422f08 100644 --- a/sdxl_train_textual_inversion.py +++ b/sdxl_train_textual_inversion.py @@ -5,6 +5,7 @@ import regex import torch from library.device_utils import init_ipex + init_ipex() from library import sdxl_model_util, sdxl_train_util, train_util @@ -19,8 +20,8 @@ class SdxlTextualInversionTrainer(train_textual_inversion.TextualInversionTraine self.is_sdxl = True def assert_extra_args(self, args, train_dataset_group): - super().assert_extra_args(args, train_dataset_group) - sdxl_train_util.verify_sdxl_training_args(args, supportTextEncoderCaching=False) + # super().assert_extra_args(args, train_dataset_group) # do not call parent because it checks reso steps with 64 + sdxl_train_util.verify_sdxl_training_args(args, support_text_encoder_caching=False) train_dataset_group.verify_bucket_reso_steps(32) @@ -122,8 +123,7 @@ class SdxlTextualInversionTrainer(train_textual_inversion.TextualInversionTraine def setup_parser() -> argparse.ArgumentParser: parser = train_textual_inversion.setup_parser() - # don't add sdxl_train_util.add_sdxl_training_arguments(parser): because it only adds text encoder caching - # sdxl_train_util.add_sdxl_training_arguments(parser) + sdxl_train_util.add_sdxl_training_arguments(parser, support_text_encoder_caching=False) return parser From f9710863ca0e80d3d781c2f04ef1a23e03d9fd90 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 10 Oct 2025 15:58:21 -0400 Subject: [PATCH 5/5] Set is_swapping_blocks before loading_device, add warning for ignoring fp8_scaled if already fp8 --- flux_train_network.py | 4 ++-- library/fp8_optimization_utils.py | 13 ++++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index cfc61708..8620a6f2 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -99,6 +99,8 @@ class FluxNetworkTrainer(train_network.NetworkTrainer): def load_target_model(self, args, weight_dtype, accelerator): # currently offload to cpu for some models + self.is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 + # if the file is fp8 and we are using fp8_base, we can load it as is (fp8) loading_dtype = None if args.fp8_base else weight_dtype @@ -125,8 +127,6 @@ class FluxNetworkTrainer(train_network.NetworkTrainer): # if args.split_mode: # model = self.prepare_split_model(model, weight_dtype, accelerator) - - self.is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 if self.is_swapping_blocks: # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") diff --git a/library/fp8_optimization_utils.py b/library/fp8_optimization_utils.py index 02f99ab6..9ea62a58 100644 --- a/library/fp8_optimization_utils.py +++ b/library/fp8_optimization_utils.py @@ -306,11 +306,22 @@ def load_safetensors_with_fp8_optimization( state_dict[key] = value continue + original_dtype = value.dtype + + if original_dtype in (torch.float8_e4m3fn, torch.float8_e5m2, torch.float8_e4m3fnuz, torch.float8_e5m2fnuz): + logger.warning( + f"Skipping FP8 quantization for key {key} as it is already in FP8 format ({original_dtype}). " + "Loading checkpoint as-is without re-quantization." + ) + target_device = calc_device if (calc_device is not None and move_to_device) else original_device + value = value.to(target_device) + state_dict[key] = value + continue + # Move to calculation device if calc_device is not None: value = value.to(calc_device) - original_dtype = value.dtype quantized_weight, scale_tensor = quantize_weight( key, value, fp8_dtype, max_value, min_value, quantization_mode, block_size )