diff --git a/docs/hunyuan_image_train_network.md b/docs/hunyuan_image_train_network.md index 658a7beb..165c3df4 100644 --- a/docs/hunyuan_image_train_network.md +++ b/docs/hunyuan_image_train_network.md @@ -454,6 +454,9 @@ python hunyuan_image_minimal_inference.py \ - `--flow_shift`: Flow matching shift parameter (default: 5.0) - `--text_encoder_cpu`: Run the text encoders on CPU to reduce VRAM usage - `--vae_chunk_size`: Chunk size for VAE decoding to reduce memory usage (default: None, no chunking). 16 is recommended if enabled. +- `--apg_start_step_general` and `--apg_start_step_ocr`: Start steps for APG (Adaptive Projected Guidance) if using APG during inference. `5` and `38` are the official recommended values for 50 steps. If this value exceeds `--infer_steps`, APG will not be applied. +- `--guidance_rescale`: Rescales the guidance for steps before APG starts. Default is `0.0` (no rescaling). If you use this option, a value around `0.5` might be a good starting point. +- `--guidance_rescale_apg`: Rescales the guidance for APG. Default is `0.0` (no rescaling). This option doesn't seem to have a large effect, but if you use it, a value around `0.5` might be a good starting point. `--split_attn` is not supported (since inference is done one at a time). `--fp8_vl` is not supported, please use CPU for the text encoder if VRAM is insufficient. 
@@ -470,6 +473,9 @@ python hunyuan_image_minimal_inference.py \ - `--flow_shift`: Flow Matchingシフトパラメータ(デフォルト: 5.0) - `--text_encoder_cpu`: テキストエンコーダをCPUで実行してVRAM使用量削減 - `--vae_chunk_size`: VAEデコーディングのチャンクサイズ(デフォルト: None、チャンク処理なし)。有効にする場合は16を推奨。 +- `--apg_start_step_general` と `--apg_start_step_ocr`: 推論中にAPGを使用する場合の開始ステップ。50ステップの場合、公式推奨値はそれぞれ5と38です。この値が`--infer_steps`を超えると、APGは適用されません。 +- `--guidance_rescale`: APG開始前のステップに対するガイダンスのリスケーリング。デフォルトは0.0(リスケーリングなし)。使用する場合、0.5程度から始めて調整してください。 +- `--guidance_rescale_apg`: APGに対するガイダンスのリスケーリング。デフォルトは0.0(リスケーリングなし)。このオプションは大きな効果はないようですが、使用する場合は0.5程度から始めて調整してください。 `--split_attn`はサポートされていません(1件ずつ推論するため)。`--fp8_vl`もサポートされていません。VRAMが不足する場合はテキストエンコーダをCPUで実行してください。 diff --git a/hunyuan_image_minimal_inference.py b/hunyuan_image_minimal_inference.py index d0184feb..3f63270b 100644 --- a/hunyuan_image_minimal_inference.py +++ b/hunyuan_image_minimal_inference.py @@ -85,7 +85,13 @@ def parse_args() -> argparse.Namespace: "--guidance_rescale", type=float, default=0.0, - help="Guidance rescale factor for steps without APG, 0.0 to 1.0. Default is 0.0 (no rescale)." + help="Guidance rescale factor for steps without APG, 0.0 to 1.0. Default is 0.0 (no rescale).", ) parser.add_argument( "--guidance_rescale_apg", type=float, default=0.0, help="Guidance rescale factor for steps with APG, 0.0 to 1.0. Default is 0.0 (no rescale).", ) parser.add_argument("--prompt", type=str, default=None, help="prompt for generation") parser.add_argument("--negative_prompt", type=str, default="", help="negative prompt for generation, default is empty string") @@ -695,10 +701,18 @@ def generate_body( # Prepare Guider cfg_guider_ocr = hunyuan_image_utils.AdaptiveProjectedGuidance( - guidance_scale=10.0, eta=0.0, adaptive_projected_guidance_rescale=10.0, adaptive_projected_guidance_momentum=-0.5 + guidance_scale=10.0, + eta=0.0, + adaptive_projected_guidance_rescale=10.0, + adaptive_projected_guidance_momentum=-0.5, + guidance_rescale=args.guidance_rescale_apg, ) cfg_guider_general = hunyuan_image_utils.AdaptiveProjectedGuidance( - guidance_scale=10.0, eta=0.0, adaptive_projected_guidance_rescale=10.0, adaptive_projected_guidance_momentum=-0.5 + guidance_scale=10.0, + eta=0.0, + adaptive_projected_guidance_rescale=10.0, + adaptive_projected_guidance_momentum=-0.5, + guidance_rescale=args.guidance_rescale_apg, ) # Denoising loop diff --git a/library/hunyuan_image_utils.py b/library/hunyuan_image_utils.py index a1e7d4e9..3b0d68fd 100644 --- a/library/hunyuan_image_utils.py +++ b/library/hunyuan_image_utils.py @@ -401,8 +401,6 @@ class AdaptiveProjectedGuidance: guidance_rescale: float = 0.0, use_original_formulation: bool = False, ): - assert guidance_rescale == 0.0, "guidance_rescale > 0.0 not supported." - self.guidance_scale = guidance_scale self.adaptive_projected_guidance_momentum = adaptive_projected_guidance_momentum self.adaptive_projected_guidance_rescale = adaptive_projected_guidance_rescale @@ -425,6 +423,10 @@ class AdaptiveProjectedGuidance: self.use_original_formulation, ) + if self.guidance_rescale > 0.0: + print(f"Applying guidance rescale with factor {self.guidance_rescale} at step {step}") + pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale) + return pred