support original controlnet

2026-04-06 21:52:27 +00:00 · 2023-02-20 12:54:44 +09:00
parent d94c0d70fe
commit 014fd3d037
2 changed files with 406 additions and 18 deletions
--- a/gen_img_diffusers.py
+++ b/gen_img_diffusers.py
@@ -80,6 +80,8 @@ from PIL import Image
 from PIL.PngImagePlugin import PngInfo

 import library.model_util as model_util
+import tools.original_control_net as original_control_net
+from tools.original_control_net import ControlNetInfo

 # Tokenizer: checkpointから読み込むのではなくあらかじめ提供されているものを使う
 TOKENIZER_PATH = "openai/clip-vit-large-patch14"
@@ -486,6 +488,9 @@ class PipelineLike():
      self.vgg16_feat_model = torchvision.models._utils.IntermediateLayerGetter(vgg16_model.features, return_layers=return_layers)
      self.vgg16_normalize = transforms.Normalize(mean=VGG16_IMAGE_MEAN, std=VGG16_IMAGE_STD)

+    # ControlNet
+    self.control_nets: List[ControlNetInfo] = []
+
  # Textual Inversion
  def add_token_replacement(self, target_token_id, rep_token_ids):
    self.token_replacements[target_token_id] = rep_token_ids
@@ -499,7 +504,11 @@ class PipelineLike():
        new_tokens.append(token)
    return new_tokens

+  def set_control_nets(self, ctrl_nets):
+    self.control_nets = ctrl_nets
+
  # region xformersとか使う部分：独自に書き換えるので関係なし
+
  def enable_xformers_memory_efficient_attention(self):
    r"""
    Enable memory efficient attention as implemented in xformers.
@@ -751,7 +760,7 @@ class PipelineLike():
      text_embeddings_clip = self.clip_model.get_text_features(clip_text_input)
      text_embeddings_clip = text_embeddings_clip / text_embeddings_clip.norm(p=2, dim=-1, keepdim=True)      # prompt複数件でもOK

-    if self.clip_image_guidance_scale > 0 or self.vgg16_guidance_scale > 0 and clip_guide_images is not None:
+    if self.clip_image_guidance_scale > 0 or self.vgg16_guidance_scale > 0 and clip_guide_images is not None or self.control_nets:
      if isinstance(clip_guide_images, PIL.Image.Image):
        clip_guide_images = [clip_guide_images]

@@ -764,7 +773,7 @@ class PipelineLike():
        image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, dim=-1, keepdim=True)
        if len(image_embeddings_clip) == 1:
          image_embeddings_clip = image_embeddings_clip.repeat((batch_size, 1, 1, 1))
-      else:
+      elif self.vgg16_guidance_scale > 0:
        size = (width // VGG16_INPUT_RESIZE_DIV, height // VGG16_INPUT_RESIZE_DIV)            # とりあえず1/4に（小さいか?）
        clip_guide_images = [preprocess_vgg16_guide_image(im, size) for im in clip_guide_images]
        clip_guide_images = torch.cat(clip_guide_images, dim=0)
@@ -773,6 +782,10 @@ class PipelineLike():
        image_embeddings_vgg16 = self.vgg16_feat_model(clip_guide_images)['feat']
        if len(image_embeddings_vgg16) == 1:
          image_embeddings_vgg16 = image_embeddings_vgg16.repeat((batch_size, 1, 1, 1))
+      else:
+        # ControlNetのhintにguide imageを流用する
+        # 前処理はControlNet側で行う
+        pass

    # set timesteps
    self.scheduler.set_timesteps(num_inference_steps, self.device)
@@ -863,12 +876,21 @@ class PipelineLike():
      extra_step_kwargs["eta"] = eta

    num_latent_input = (3 if negative_scale is not None else 2) if do_classifier_free_guidance else 1
+
+    if self.control_nets:
+      guided_hints = original_control_net.get_guided_hints(self.control_nets, num_latent_input, batch_size, clip_guide_images)
+
    for i, t in enumerate(tqdm(timesteps)):
      # expand the latents if we are doing classifier free guidance
      latent_model_input = latents.repeat((num_latent_input, 1, 1, 1))
      latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
      # predict the noise residual
-      noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+      if self.control_nets:
+        noise_pred = original_control_net.call_unet_and_control_net(
+            i, num_latent_input, self.unet, self.control_nets, guided_hints, i / len(timesteps), latent_model_input, t, text_embeddings).sample
+      else:
+        noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

      # perform guidance
      if do_classifier_free_guidance:
@@ -2066,6 +2088,18 @@ def main(args):
  else:
    networks = []

+  # ControlNetの処理
+  control_nets: List[ControlNetInfo] = []
+  if args.control_net_models:
+    for i, model in enumerate(args.control_net_models):
+      prep_type = None if not args.control_net_preps or len(args.control_net_preps) <= i else args.control_net_preps[i]
+      weight = 1.0 if not args.control_net_weights or len(args.control_net_weights) <= i else args.control_net_weights[i]
+      ratio = 1.0 if not args.control_net_ratios or len(args.control_net_ratios) <= i else args.control_net_ratios[i]
+
+      ctrl_unet, ctrl_net = original_control_net.load_control_net(args.v2, unet, model)
+      prep = original_control_net.load_preprocess(prep_type)
+      control_nets.append(ControlNetInfo(ctrl_unet, ctrl_net, prep, weight, ratio))
+
  if args.opt_channels_last:
    print(f"set optimizing: channels last")
    text_encoder.to(memory_format=torch.channels_last)
@@ -2079,9 +2113,14 @@ def main(args):
    if vgg16_model is not None:
      vgg16_model.to(memory_format=torch.channels_last)

+    for cn in control_nets:
+      cn.unet.to(memory_format=torch.channels_last)
+      cn.net.to(memory_format=torch.channels_last)
+
  pipe = PipelineLike(device, vae, text_encoder, tokenizer, unet, scheduler, args.clip_skip,
                      clip_model, args.clip_guidance_scale, args.clip_image_guidance_scale,
                      vgg16_model, args.vgg16_guidance_scale, args.vgg16_guidance_layer)
+  pipe.set_control_nets(control_nets)
  print("pipeline is ready.")

  if args.diffusers_xformers:
@@ -2215,9 +2254,12 @@ def main(args):

  prev_image = None               # for VGG16 guided
  if args.guide_image_path is not None:
-    print(f"load image for CLIP/VGG16 guidance: {args.guide_image_path}")
-    guide_images = load_images(args.guide_image_path)
-    print(f"loaded {len(guide_images)} guide images for CLIP/VGG16 guidance")
+    print(f"load image for CLIP/VGG16/ControlNet guidance: {args.guide_image_path}")
+    guide_images = []
+    for p in args.guide_image_path:
+      guide_images.extend(load_images(p))
+
+    print(f"loaded {len(guide_images)} guide images for guidance")
    if len(guide_images) == 0:
      print(f"No guide image, use previous generated image. / ガイド画像がありません。直前に生成した画像を使います: {args.image_path}")
      guide_images = None
@@ -2257,22 +2299,22 @@ def main(args):
        print("process 1st stage1")
        batch_1st = []
        for base, ext in batch:
-          width_1st = int(width * args.highres_fix_scale + .5)
-          height_1st = int(height * args.highres_fix_scale + .5)
+          width_1st = int(ext.width * args.highres_fix_scale + .5)
+          height_1st = int(ext.height * args.highres_fix_scale + .5)
          width_1st = width_1st - width_1st % 32
          height_1st = height_1st - height_1st % 32

-          bd_1st = BatchData(base, BatchDataExt(width_1st, height_1st, args.highres_fix_steps,
-                             ext.scale, ext.negative_scale, ext.strength, ext.network_muls))
-          batch_1st.append(bd_1st)
+          ext_1st = BatchDataExt(width_1st, height_1st, args.highres_fix_steps, ext.scale,
+                                 ext.negative_scale, ext.strength, ext.network_muls)
+          batch_1st.append(BatchData(base, ext_1st))
        images_1st = process_batch(batch_1st, True, True)

        # 2nd stageのバッチを作成して以下処理する
        print("process 2nd stage1")
        batch_2nd = []
        for i, (bd, image) in enumerate(zip(batch, images_1st)):
-          image = image.resize((width, height), resample=PIL.Image.LANCZOS)      # img2imgとして設定
-          bd_2nd = BatchData(BatchDataBase(*bd.base[0:3], bd.base.seed+1, image, None, *bd.base[6:8]), bd.ext)
+          image = image.resize((bd.ext.width, bd.ext.height), resample=PIL.Image.LANCZOS)      # img2imgとして設定
+          bd_2nd = BatchData(BatchDataBase(*bd.base[0:3], bd.base.seed+1, image, None, *bd.base[6:]), bd.ext)
          batch_2nd.append(bd_2nd)
        batch = batch_2nd

@@ -2328,9 +2370,13 @@ def main(args):
            all_masks_are_same = mask_images[-2] is mask_image

        if guide_image is not None:
-          guide_images.append(guide_image)
-          if i > 0 and all_guide_images_are_same:
-            all_guide_images_are_same = guide_images[-2] is guide_image
+          if type(guide_image) is list:
+            guide_images.extend(guide_image)
+            all_guide_images_are_same = False
+          else:
+            guide_images.append(guide_image)
+            if i > 0 and all_guide_images_are_same:
+              all_guide_images_are_same = guide_images[-2] is guide_image

        # make start code
        torch.manual_seed(seed)
@@ -2353,6 +2399,14 @@ def main(args):
      if guide_images is not None and all_guide_images_are_same:
        guide_images = guide_images[0]

+      # ControlNet使用時はguide imageをリサイズする
+      if control_nets:
+        # TODO resampleのメソッド
+        guide_images = guide_images if type(guide_images) == list else [guide_images]
+        guide_images = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in guide_images]
+        if len(guide_images) == 1:
+          guide_images = guide_images[0]
+
      # generate
      if networks:
        for n, m in zip(networks, network_muls if network_muls else network_default_muls):
@@ -2545,7 +2599,12 @@ def main(args):
          mask_image = mask_images[global_step % len(mask_images)]

        if guide_images is not None:
-          guide_image = guide_images[global_step % len(guide_images)]
+          if control_nets:                                                        # 複数件の場合あり
+            c = len(control_nets)
+            p = global_step % (len(guide_images) // c)
+            guide_image = guide_images[p * c:p * c + c]
+          else:
+            guide_image = guide_images[global_step % len(guide_images)]
        elif args.clip_image_guidance_scale > 0 or args.vgg16_guidance_scale > 0:
          if prev_image is None:
            print("Generate 1st image without guide image.")
@@ -2646,7 +2705,8 @@ if __name__ == '__main__':
                      help='enable VGG16 guided SD by image, scale for guidance / 画像によるVGG16 guided SDを有効にしてこのscaleを適用する')
  parser.add_argument("--vgg16_guidance_layer", type=int, default=20,
                      help='layer of VGG16 to calculate contents guide (1~30, 20 for conv4_2) / VGG16のcontents guideに使うレイヤー番号 (1~30、20はconv4_2)')
-  parser.add_argument("--guide_image_path", type=str, default=None, help="image to CLIP guidance / CLIP guided SDでガイドに使う画像")
+  parser.add_argument("--guide_image_path", type=str, default=None, nargs="*",
+                      help="image to CLIP guidance / CLIP guided SDでガイドに使う画像")
  parser.add_argument("--highres_fix_scale", type=float, default=None,
                      help="enable highres fix, reso scale for 1st stage / highres fixを有効にして最初の解像度をこのscaleにする")
  parser.add_argument("--highres_fix_steps", type=int, default=28,
@@ -2656,5 +2716,13 @@ if __name__ == '__main__':
  parser.add_argument("--negative_scale", type=float, default=None,
                      help="set another guidance scale for negative prompt / ネガティブプロンプトのscaleを指定する")

+  parser.add_argument("--control_net_models", type=str, default=None, nargs='*',
+                      help='ControlNet models to use / 使用するControlNetのモデル名')
+  parser.add_argument("--control_net_preps", type=str, default=None, nargs='*',
+                      help='ControlNet preprocess to use / 使用するControlNetのプリプロセス名')
+  parser.add_argument("--control_net_weights", type=float, default=None, nargs='*', help='ControlNet weights / ControlNetの重み')
+  parser.add_argument("--control_net_ratios", type=float, default=None, nargs='*',
+                      help='ControlNet guidance ratio for steps / ControlNetでガイドするステップ比率')
+
  args = parser.parse_args()
  main(args)
--- a/tools/original_control_net.py
+++ b/tools/original_control_net.py
@@ -0,0 +1,320 @@
+from typing import List, NamedTuple, Any
+import numpy as np
+import cv2
+import torch
+from safetensors.torch import load_file
+
+from diffusers import UNet2DConditionModel
+from diffusers.models.unet_2d_condition import UNet2DConditionOutput
+
+import library.model_util as model_util
+
+
+class ControlNetInfo(NamedTuple):
+  unet: Any
+  net: Any
+  prep: Any
+  weight: float
+  ratio: float
+
+
+class ControlNet(torch.nn.Module):
+  def __init__(self) -> None:
+    super().__init__()
+
+    # make control model
+    self.control_model = torch.nn.Module()
+
+    dims = [320, 320, 320, 320, 640, 640, 640, 1280, 1280, 1280, 1280, 1280]
+    zero_convs = torch.nn.ModuleList()
+    for i, dim in enumerate(dims):
+      sub_list = torch.nn.ModuleList([torch.nn.Conv2d(dim, dim, 1)])
+      zero_convs.append(sub_list)
+    self.control_model.add_module("zero_convs", zero_convs)
+
+    middle_block_out = torch.nn.Conv2d(1280, 1280, 1)
+    self.control_model.add_module("middle_block_out", torch.nn.ModuleList([middle_block_out]))
+
+    dims = [16, 16, 32, 32, 96, 96, 256, 320]
+    strides = [1, 1, 2, 1, 2, 1, 2, 1]
+    prev_dim = 3
+    input_hint_block = torch.nn.Sequential()
+    for i, (dim, stride) in enumerate(zip(dims, strides)):
+      input_hint_block.append(torch.nn.Conv2d(prev_dim, dim, 3, stride, 1))
+      if i < len(dims) - 1:
+        input_hint_block.append(torch.nn.SiLU())
+      prev_dim = dim
+    self.control_model.add_module("input_hint_block", input_hint_block)
+
+
+def load_control_net(v2, unet, model):
+  device = unet.device
+
+  # control sdからキー変換しつつU-Netに対応する部分のみ取り出し、DiffusersのU-Netに読み込む
+  # state dictを読み込む
+  print(f"ControlNet: loading control SD model : {model}")
+
+  if model_util.is_safetensors(model):
+    ctrl_sd_sd = load_file(model)
+  else:
+    ctrl_sd_sd = torch.load(model, map_location='cpu')
+    ctrl_sd_sd = ctrl_sd_sd.pop("state_dict", ctrl_sd_sd)
+
+  # 重みをU-Netに読み込めるようにする。ControlNetはSD版のstate dictなので、それを読み込む
+  is_difference = "difference" in ctrl_sd_sd
+  print("ControlNet: loading difference")
+
+  # ControlNetには存在しないキーがあるので、まず現在のU-NetでSD版の全keyを作っておく
+  # またTransfer Controlの元weightとなる
+  ctrl_unet_sd_sd = model_util.convert_unet_state_dict_to_sd(v2, unet.state_dict())
+
+  # 元のU-Netに影響しないようにコピーする。またprefixが付いていないので付ける
+  for key in list(ctrl_unet_sd_sd.keys()):
+    ctrl_unet_sd_sd["model.diffusion_model." + key] = ctrl_unet_sd_sd.pop(key).clone()
+
+  zero_conv_sd = {}
+  for key in list(ctrl_sd_sd.keys()):
+    if key.startswith("control_"):
+      unet_key = "model.diffusion_" + key[len("control_"):]
+      if unet_key not in ctrl_unet_sd_sd:               # zero conv
+        zero_conv_sd[key] = ctrl_sd_sd[key]
+        continue
+      if is_difference:                                 # Transfer Control
+        ctrl_unet_sd_sd[unet_key] += ctrl_sd_sd[key].to(device, dtype=unet.dtype)
+      else:
+        ctrl_unet_sd_sd[unet_key] = ctrl_sd_sd[key].to(device, dtype=unet.dtype)
+
+  unet_config = model_util.create_unet_diffusers_config(v2)
+  ctrl_unet_du_sd = model_util.convert_ldm_unet_checkpoint(v2, ctrl_unet_sd_sd, unet_config)    # DiffUsers版ControlNetのstate dict
+
+  # ControlNetのU-Netを作成する
+  ctrl_unet = UNet2DConditionModel(**unet_config)
+  info = ctrl_unet.load_state_dict(ctrl_unet_du_sd)
+  print("ControlNet: loading Control U-Net:", info)
+
+  # U-Net以外のControlNetを作成する
+  # TODO support middle only
+  ctrl_net = ControlNet()
+  info = ctrl_net.load_state_dict(zero_conv_sd)
+  print("ControlNet: loading ControlNet:", info)
+
+  ctrl_unet.to(unet.device, dtype=unet.dtype)
+  ctrl_net.to(unet.device, dtype=unet.dtype)
+  return ctrl_unet, ctrl_net
+
+
+def load_preprocess(prep_type: str):
+  if prep_type is None or prep_type.lower() == "none":
+    return None
+
+  if prep_type.startswith("canny"):
+    args = prep_type.split("_")
+    th1 = int(args[1]) if len(args) >= 2 else 63
+    th2 = int(args[2]) if len(args) >= 3 else 191
+
+    def canny(img):
+      img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+      return cv2.Canny(img, th1, th2)
+    return canny
+
+  print("Unsupported prep type:", prep_type)
+  return None
+
+
+def preprocess_ctrl_net_hint_image(image):
+  image = np.array(image).astype(np.float32) / 255.0
+  image = image[:, :, ::-1].copy()                         # rgb to bgr
+  image = image[None].transpose(0, 3, 1, 2)       # nchw
+  image = torch.from_numpy(image)
+  return image                              # 0 to 1
+
+
+def get_guided_hints(control_nets: List[ControlNetInfo], num_latent_input, b_size, hints):
+  guided_hints = []
+  for i, cnet_info in enumerate(control_nets):
+    # hintは 1枚目の画像のcnet1, 1枚目の画像のcnet2, 1枚目の画像のcnet3, 2枚目の画像のcnet1, 2枚目の画像のcnet2 ... と並んでいること
+    b_hints = []
+    if len(hints) == 1:           # すべて同じ画像をhintとして使う
+      hint = hints[0]
+      if cnet_info.prep is not None:
+        hint = cnet_info.prep(hint)
+      hint = preprocess_ctrl_net_hint_image(hint)
+      b_hints = [hint for _ in range(b_size)]
+    else:
+      for bi in range(b_size):
+        hint = hints[(bi * len(control_nets) + i) % len(hints)]
+        if cnet_info.prep is not None:
+          hint = cnet_info.prep(hint)
+        hint = preprocess_ctrl_net_hint_image(hint)
+        b_hints.append(hint)
+    b_hints = torch.cat(b_hints, dim=0)
+    b_hints = b_hints.to(cnet_info.unet.device, dtype=cnet_info.unet.dtype)
+
+    guided_hint = cnet_info.net.control_model.input_hint_block(b_hints)
+    guided_hints.append(guided_hint)
+  return guided_hints
+
+
+def call_unet_and_control_net(step, num_latent_input, original_unet, control_nets: List[ControlNetInfo], guided_hints, current_ratio, sample, timestep, encoder_hidden_states):
+  # ControlNet
+  # 複数のControlNetの場合は、出力をマージするのではなく交互に適用する
+  cnet_cnt = len(control_nets)
+  cnet_idx = step % cnet_cnt
+  cnet_info = control_nets[cnet_idx]
+
+  # print(current_ratio, cnet_info.prep, cnet_info.weight, cnet_info.ratio)
+  if cnet_info.ratio < current_ratio:
+    return original_unet(sample, timestep, encoder_hidden_states)
+
+  guided_hint = guided_hints[cnet_idx]
+  guided_hint = guided_hint.repeat((num_latent_input, 1, 1, 1))
+  outs = unet_forward(True, cnet_info.net, cnet_info.unet, guided_hint, None, sample, timestep, encoder_hidden_states)
+  outs = [o * cnet_info.weight for o in outs]
+
+  # U-Net
+  return unet_forward(False, cnet_info.net, original_unet, None, outs, sample, timestep, encoder_hidden_states)
+
+
+"""
+  # これはmergeのバージョン
+  # ControlNet
+  cnet_outs_list = []
+  for i, cnet_info in enumerate(control_nets):
+    # print(current_ratio, cnet_info.prep, cnet_info.weight, cnet_info.ratio)
+    if cnet_info.ratio < current_ratio:
+      continue
+    guided_hint = guided_hints[i]
+    outs = unet_forward(True, cnet_info.net, cnet_info.unet, guided_hint, None, sample, timestep, encoder_hidden_states)
+    for i in range(len(outs)):
+      outs[i] *= cnet_info.weight
+
+    cnet_outs_list.append(outs)
+
+  count = len(cnet_outs_list)
+  if count == 0:
+    return original_unet(sample, timestep, encoder_hidden_states)
+
+  # sum of controlnets
+  for i in range(1, count):
+    cnet_outs_list[0] += cnet_outs_list[i]
+
+  # U-Net
+  return unet_forward(False, cnet_info.net, original_unet, None, cnet_outs_list[0], sample, timestep, encoder_hidden_states)
+"""
+
+
+def unet_forward(is_control_net, control_net: ControlNet, unet: UNet2DConditionModel, guided_hint, ctrl_outs, sample, timestep, encoder_hidden_states):
+  # copy from UNet2DConditionModel
+  default_overall_up_factor = 2**unet.num_upsamplers
+
+  forward_upsample_size = False
+  upsample_size = None
+
+  if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+    print("Forward upsample size to force interpolation output size.")
+    forward_upsample_size = True
+
+  # 0. center input if necessary
+  if unet.config.center_input_sample:
+    sample = 2 * sample - 1.0
+
+  # 1. time
+  timesteps = timestep
+  if not torch.is_tensor(timesteps):
+    # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+    # This would be a good case for the `match` statement (Python 3.10+)
+    is_mps = sample.device.type == "mps"
+    if isinstance(timestep, float):
+      dtype = torch.float32 if is_mps else torch.float64
+    else:
+      dtype = torch.int32 if is_mps else torch.int64
+    timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+  elif len(timesteps.shape) == 0:
+    timesteps = timesteps[None].to(sample.device)
+
+  # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+  timesteps = timesteps.expand(sample.shape[0])
+
+  t_emb = unet.time_proj(timesteps)
+
+  # timesteps does not contain any weights and will always return f32 tensors
+  # but time_embedding might actually be running in fp16. so we need to cast here.
+  # there might be better ways to encapsulate this.
+  t_emb = t_emb.to(dtype=unet.dtype)
+  emb = unet.time_embedding(t_emb)
+
+  outs = []                     # output of ControlNet
+  zc_idx = 0
+
+  # 2. pre-process
+  sample = unet.conv_in(sample)
+  if is_control_net:
+    sample += guided_hint
+    outs.append(control_net.control_model.zero_convs[zc_idx][0](sample))  # , emb, encoder_hidden_states))
+    zc_idx += 1
+
+  # 3. down
+  down_block_res_samples = (sample,)
+  for downsample_block in unet.down_blocks:
+    if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+      sample, res_samples = downsample_block(
+          hidden_states=sample,
+          temb=emb,
+          encoder_hidden_states=encoder_hidden_states,
+      )
+    else:
+      sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+    if is_control_net:
+      for rs in res_samples:
+        outs.append(control_net.control_model.zero_convs[zc_idx][0](rs))  # , emb, encoder_hidden_states))
+        zc_idx += 1
+
+    down_block_res_samples += res_samples
+
+  # 4. mid
+  sample = unet.mid_block(sample, emb, encoder_hidden_states=encoder_hidden_states)
+  if is_control_net:
+    outs.append(control_net.control_model.middle_block_out[0](sample))
+    return outs
+
+  if not is_control_net:
+    sample += ctrl_outs.pop()
+
+  # 5. up
+  for i, upsample_block in enumerate(unet.up_blocks):
+    is_final_block = i == len(unet.up_blocks) - 1
+
+    res_samples = down_block_res_samples[-len(upsample_block.resnets):]
+    down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+    if not is_control_net and len(ctrl_outs) > 0:
+      res_samples = list(res_samples)
+      apply_ctrl_outs = ctrl_outs[-len(res_samples):]
+      ctrl_outs = ctrl_outs[:-len(res_samples)]
+      for j in range(len(res_samples)):
+        res_samples[j] = res_samples[j] + apply_ctrl_outs[j]
+      res_samples = tuple(res_samples)
+
+    # if we have not reached the final block and need to forward the
+    # upsample size, we do it here
+    if not is_final_block and forward_upsample_size:
+      upsample_size = down_block_res_samples[-1].shape[2:]
+
+    if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+      sample = upsample_block(
+          hidden_states=sample,
+          temb=emb,
+          res_hidden_states_tuple=res_samples,
+          encoder_hidden_states=encoder_hidden_states,
+          upsample_size=upsample_size,
+      )
+    else:
+      sample = upsample_block(
+          hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
+      )
+  # 6. post-process
+  sample = unet.conv_norm_out(sample)
+  sample = unet.conv_act(sample)
+  sample = unet.conv_out(sample)
+
+  return UNet2DConditionOutput(sample=sample)