diff --git a/README.md b/README.md
index 017f4853..b3a0b6d5 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,16 @@
 __Stable Diffusion web UI now seems to support LoRA trained by ``sd-scripts``.__

 The feature of SDXL training is now available in sdxl branch as an experimental feature.

-Aug 4, 2023: The feature will be merged into the main branch soon. Following are the changes from the previous version.
+Aug 6, 2023: The feature will be merged into the main branch soon. Following are the changes from the previous version.
+
+- [SAI Model Spec](https://github.com/Stability-AI/ModelSpec) metadata is now partially supported. `hash_sha256` is not supported yet.
+  - The main items are set automatically.
+  - You can set title, author, description, license and tags with `--metadata_xxx` options in each training script.
+  - Merging scripts also support minimum SAI Model Spec metadata. See the help message for the usage.
+  - A metadata editor will be available soon.
+- SDXL LoRA now records `sdxl_base_v1-0` in the `ss_base_model_version` metadata item, instead of `v0-9`.
+
+Aug 4, 2023:

 - `bitsandbytes` is now optional. Please install it if you want to use it. The instructions are in a later section.
 - `albumentations` is not required anymore.
@@ -218,7 +227,7 @@ For 8bit optimizer, you need to install `bitsandbytes`. For Linux, please instal
 For Windows, there are several versions of `bitsandbytes`:

 - `bitsandbytes` 0.35.0: Stable version. AdamW8bit is available. `full_bf16` is not available.
-- `bitsandbytes` 0.39.1: Lion8bit, PagedAdamW8bit and PagedLion8bit are available. `full_bf16` is available.
+- `bitsandbytes` 0.41.1: Lion8bit, PagedAdamW8bit and PagedLion8bit are available. `full_bf16` is available.

 Note: `bitsandbytes` versions above 0.35.0 up to 0.41.0 seem to have an issue: https://github.com/TimDettmers/bitsandbytes/issues/659

@@ -240,12 +249,12 @@ cp .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_set
 This will install `bitsandbytes` 0.35.0 and copy the necessary files to the `bitsandbytes` directory.
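As a quick sanity check after either install path, you can confirm which `bitsandbytes` build is active and which 8bit optimizers it exposes. This is a minimal sketch, not part of the repository; `importlib.metadata` is used so it works even if the package does not export `__version__`.

```python
# Minimal check (illustrative): print the installed bitsandbytes version and
# probe for the 8bit optimizers described above.
from importlib.metadata import version

print("bitsandbytes", version("bitsandbytes"))  # e.g. 0.35.0 or 0.41.1

from bitsandbytes.optim import AdamW8bit  # available from 0.35.0 onward

try:
    # Lion8bit and the paged optimizers only exist in newer releases (e.g. 0.41.1).
    from bitsandbytes.optim import Lion8bit, PagedAdamW8bit, PagedLion8bit
    print("Lion8bit / PagedAdamW8bit / PagedLion8bit available")
except ImportError:
    print("Lion8bit / paged optimizers not available in this version")
```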
-### bitsandbytes 0.39.1 for Windows +### bitsandbytes 0.41.1 for Windows Install the Windows version whl file from [here](https://github.com/jllllll/bitsandbytes-windows-webui) or other sources, like: ```powershell -pip install https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.38.1-py3-none-any.whl +python -m pip install bitsandbytes==0.41.1 --prefer-binary --extra-index-url=https://jllllll.github.io/bitsandbytes-windows-webui ``` ## Upgrade diff --git a/library/huggingface_util.py b/library/huggingface_util.py index 1dc496ff..376fdb1e 100644 --- a/library/huggingface_util.py +++ b/library/huggingface_util.py @@ -26,7 +26,7 @@ def upload( repo_id = args.huggingface_repo_id repo_type = args.huggingface_repo_type token = args.huggingface_token - path_in_repo = args.huggingface_path_in_repo + dest_suffix + path_in_repo = args.huggingface_path_in_repo + dest_suffix if args.huggingface_path_in_repo is not None else None private = args.huggingface_repo_visibility is None or args.huggingface_repo_visibility != "public" api = HfApi(token=token) if not exists_repo(repo_id=repo_id, repo_type=repo_type, token=token): diff --git a/library/model_util.py b/library/model_util.py index 42146b94..860c170b 100644 --- a/library/model_util.py +++ b/library/model_util.py @@ -563,10 +563,10 @@ def convert_ldm_clip_checkpoint_v1(checkpoint): for key in keys: if key.startswith("cond_stage_model.transformer"): text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] - + # support checkpoint without position_ids (invalid checkpoint) if "text_model.embeddings.position_ids" not in text_model_dict: - text_model_dict["text_model.embeddings.position_ids"] = torch.arange(77).unsqueeze(0) # 77 is the max length of the text + text_model_dict["text_model.embeddings.position_ids"] = torch.arange(77).unsqueeze(0) # 77 is the max length of the text return text_model_dict @@ -759,6 +759,7 @@ def convert_unet_state_dict_to_sd(v2, unet_state_dict): return new_state_dict + def controlnet_conversion_map(): unet_conversion_map = [ ("time_embed.0.weight", "time_embedding.linear_1.weight"), @@ -806,9 +807,7 @@ def controlnet_conversion_map(): sd_mid_res_prefix = f"middle_block.{2*j}." unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix)) - controlnet_cond_embedding_names = ( - ["conv_in"] + [f"blocks.{i}" for i in range(6)] + ["conv_out"] - ) + controlnet_cond_embedding_names = ["conv_in"] + [f"blocks.{i}" for i in range(6)] + ["conv_out"] for i, hf_prefix in enumerate(controlnet_cond_embedding_names): hf_prefix = f"controlnet_cond_embedding.{hf_prefix}." sd_prefix = f"input_hint_block.{i*2}." 
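One note on the `library/huggingface_util.py` change above: the old code computed `args.huggingface_path_in_repo + dest_suffix` unconditionally, which raises `TypeError` when the option is unset (`None`). A minimal sketch of the guarded behavior follows; the helper name is hypothetical and only illustrates the conditional expression used in the fix.

```python
# Sketch of the guard added in library/huggingface_util.py: append the suffix
# only when a base path is configured, otherwise propagate None.
from typing import Optional

def resolve_path_in_repo(base: Optional[str], dest_suffix: str) -> Optional[str]:
    return base + dest_suffix if base is not None else None

assert resolve_path_in_repo("models/lora", "/epoch-0001") == "models/lora/epoch-0001"
assert resolve_path_in_repo(None, "/epoch-0001") is None  # old code raised TypeError here
```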
@@ -840,6 +839,7 @@ def convert_controlnet_state_dict_to_sd(controlnet_state_dict): new_state_dict = {v: controlnet_state_dict[k] for k, v in mapping.items()} return new_state_dict + def convert_controlnet_state_dict_to_diffusers(controlnet_state_dict): unet_conversion_map, unet_conversion_map_resnet, unet_conversion_map_layer = controlnet_conversion_map() @@ -858,6 +858,7 @@ def convert_controlnet_state_dict_to_diffusers(controlnet_state_dict): new_state_dict = {v: controlnet_state_dict[k] for k, v in mapping.items()} return new_state_dict + # ================# # VAE Conversion # # ================# @@ -1066,6 +1067,7 @@ def load_models_from_stable_diffusion_checkpoint(v2, ckpt_path, device="cpu", dt return text_model, vae, unet + def get_model_version_str_for_sd1_sd2(v2, v_parameterization): # only for reference version_str = "sd" @@ -1077,6 +1079,7 @@ def get_model_version_str_for_sd1_sd2(v2, v_parameterization): version_str += "_v" return version_str + def convert_text_encoder_state_dict_to_sd_v2(checkpoint, make_dummy_weights=False): def convert_key(key): # position_idsの除去 @@ -1148,7 +1151,9 @@ def convert_text_encoder_state_dict_to_sd_v2(checkpoint, make_dummy_weights=Fals return new_sd -def save_stable_diffusion_checkpoint(v2, output_file, text_encoder, unet, ckpt_path, epochs, steps, save_dtype=None, vae=None): +def save_stable_diffusion_checkpoint( + v2, output_file, text_encoder, unet, ckpt_path, epochs, steps, metadata, save_dtype=None, vae=None +): if ckpt_path is not None: # epoch/stepを参照する。またVAEがメモリ上にないときなど、もう一度VAEを含めて読み込む checkpoint, state_dict = load_checkpoint_with_text_encoder_conversion(ckpt_path) @@ -1210,7 +1215,7 @@ def save_stable_diffusion_checkpoint(v2, output_file, text_encoder, unet, ckpt_p if is_safetensors(output_file): # TODO Tensor以外のdictの値を削除したほうがいいか - save_file(state_dict, output_file) + save_file(state_dict, output_file, metadata) else: torch.save(new_ckpt, output_file) diff --git a/library/sai_model_spec.py b/library/sai_model_spec.py new file mode 100644 index 00000000..88c2cb77 --- /dev/null +++ b/library/sai_model_spec.py @@ -0,0 +1,301 @@ +# based on https://github.com/Stability-AI/ModelSpec +import datetime +import hashlib +from io import BytesIO +import os +from typing import List, Optional, Tuple, Union +import safetensors + +r""" +# Metadata Example +metadata = { + # === Must === + "modelspec.sai_model_spec": "1.0.0", # Required version ID for the spec + "modelspec.architecture": "stable-diffusion-xl-v1-base", # Architecture, reference the ID of the original model of the arch to match the ID + "modelspec.implementation": "sgm", + "modelspec.title": "Example Model Version 1.0", # Clean, human-readable title. May use your own phrasing/language/etc + # === Should === + "modelspec.author": "Example Corp", # Your name or company name + "modelspec.description": "This is my example model to show you how to do it!", # Describe the model in your own words/language/etc. Focus on what users need to know + "modelspec.date": "2023-07-20", # ISO-8601 compliant date of when the model was created + # === Can === + "modelspec.license": "ExampleLicense-1.0", # eg CreativeML Open RAIL, etc. 
+ "modelspec.usage_hint": "Use keyword 'example'" # In your own language, very short hints about how the user should use the model +} +""" + +BASE_METADATA = { + # === Must === + "modelspec.sai_model_spec": "1.0.0", # Required version ID for the spec + "modelspec.architecture": None, + "modelspec.implementation": None, + "modelspec.title": None, + "modelspec.resolution": None, + # === Should === + "modelspec.description": None, + "modelspec.author": None, + "modelspec.date": None, + # === Can === + "modelspec.license": None, + "modelspec.tags": None, + "modelspec.merged_from": None, + "modelspec.prediction_type": None, + "modelspec.timestep_range": None, + "modelspec.encoder_layer": None, +} + +# 別に使うやつだけ定義 +MODELSPEC_TITLE = "modelspec.title" + +ARCH_SD_V1 = "stable-diffusion-v1" +ARCH_SD_V2_512 = "stable-diffusion-v2-512" +ARCH_SD_V2_768_V = "stable-diffusion-v2-768-v" +ARCH_SD_XL_V1_BASE = "stable-diffusion-xl-v1-base" + +ADAPTER_LORA = "lora" +ADAPTER_TEXTUAL_INVERSION = "textual-inversion" + +IMPL_STABILITY_AI = "https://github.com/Stability-AI/generative-models" +IMPL_DIFFUSERS = "diffusers" + +PRED_TYPE_EPSILON = "epsilon" +PRED_TYPE_V = "v" + + +def load_bytes_in_safetensors(tensors): + bytes = safetensors.torch.save(tensors) + b = BytesIO(bytes) + + b.seek(0) + header = b.read(8) + n = int.from_bytes(header, "little") + + offset = n + 8 + b.seek(offset) + + return b.read() + + +def precalculate_safetensors_hashes(state_dict): + # calculate each tensor one by one to reduce memory usage + hash_sha256 = hashlib.sha256() + for tensor in state_dict.values(): + single_tensor_sd = {"tensor": tensor} + bytes_for_tensor = load_bytes_in_safetensors(single_tensor_sd) + hash_sha256.update(bytes_for_tensor) + + return f"0x{hash_sha256.hexdigest()}" + + +def update_hash_sha256(metadata: dict, state_dict: dict): + raise NotImplementedError + + +def build_metadata( + state_dict: Optional[dict], + v2: bool, + v_parameterization: bool, + sdxl: bool, + lora: bool, + textual_inversion: bool, + timestamp: float, + title: Optional[str] = None, + reso: Optional[Union[int, Tuple[int, int]]] = None, + is_stable_diffusion_ckpt: Optional[bool] = None, + author: Optional[str] = None, + description: Optional[str] = None, + license: Optional[str] = None, + tags: Optional[str] = None, + merged_from: Optional[str] = None, + timesteps: Optional[Tuple[int, int]] = None, + clip_skip: Optional[int] = None, +): + # if state_dict is None, hash is not calculated + + metadata = {} + metadata.update(BASE_METADATA) + + # TODO メモリを消費せずかつ正しいハッシュ計算の方法がわかったら実装する + # if state_dict is not None: + # hash = precalculate_safetensors_hashes(state_dict) + # metadata["modelspec.hash_sha256"] = hash + + if sdxl: + arch = ARCH_SD_XL_V1_BASE + elif v2: + if v_parameterization: + arch = ARCH_SD_V2_768_V + else: + arch = ARCH_SD_V2_512 + else: + arch = ARCH_SD_V1 + + if lora: + arch += f"/{ADAPTER_LORA}" + elif textual_inversion: + arch += f"/{ADAPTER_TEXTUAL_INVERSION}" + + metadata["modelspec.architecture"] = arch + + if not lora and not textual_inversion and is_stable_diffusion_ckpt is None: + is_stable_diffusion_ckpt = True # default is stable diffusion ckpt if not lora and not textual_inversion + + if (lora and sdxl) or textual_inversion or is_stable_diffusion_ckpt: + # Stable Diffusion ckpt, TI, SDXL LoRA + impl = IMPL_STABILITY_AI + else: + # v1/v2 LoRA or Diffusers + impl = IMPL_DIFFUSERS + metadata["modelspec.implementation"] = impl + + if title is None: + if lora: + title = "LoRA" + elif textual_inversion: + title = 
"TextualInversion" + else: + title = "Checkpoint" + title += f"@{timestamp}" + metadata[MODELSPEC_TITLE] = title + + if author is not None: + metadata["modelspec.author"] = author + else: + del metadata["modelspec.author"] + + if description is not None: + metadata["modelspec.description"] = description + else: + del metadata["modelspec.description"] + + if merged_from is not None: + metadata["modelspec.merged_from"] = merged_from + else: + del metadata["modelspec.merged_from"] + + if license is not None: + metadata["modelspec.license"] = license + else: + del metadata["modelspec.license"] + + if tags is not None: + metadata["modelspec.tags"] = tags + else: + del metadata["modelspec.tags"] + + # remove microsecond from time + int_ts = int(timestamp) + + # time to iso-8601 compliant date + date = datetime.datetime.fromtimestamp(int_ts).isoformat() + metadata["modelspec.date"] = date + + if reso is not None: + # comma separated to tuple + if isinstance(reso, str): + reso = tuple(map(int, reso.split(","))) + if len(reso) == 1: + reso = (reso[0], reso[0]) + else: + # resolution is defined in dataset, so use default + if sdxl: + reso = 1024 + elif v2 and v_parameterization: + reso = 768 + else: + reso = 512 + if isinstance(reso, int): + reso = (reso, reso) + + metadata["modelspec.resolution"] = f"{reso[0]}x{reso[1]}" + + if v_parameterization: + metadata["modelspec.prediction_type"] = PRED_TYPE_V + else: + metadata["modelspec.prediction_type"] = PRED_TYPE_EPSILON + + if timesteps is not None: + metadata["modelspec.timestep_range"] = timesteps + else: + del metadata["modelspec.timestep_range"] + + if clip_skip is not None: + metadata["modelspec.encoder_layer"] = f"{clip_skip}" + else: + del metadata["modelspec.encoder_layer"] + + # # assert all values are filled + # assert all([v is not None for v in metadata.values()]), metadata + if not all([v is not None for v in metadata.values()]): + print(f"Internal error: some metadata values are None: {metadata}") + + return metadata + + +# region utils + + +def get_title(metadata: dict) -> Optional[str]: + return metadata.get(MODELSPEC_TITLE, None) + + +def load_metadata_from_safetensors(model: str) -> dict: + if not model.endswith(".safetensors"): + return {} + + with safetensors.safe_open(model, framework="pt") as f: + metadata = f.metadata() + if metadata is None: + metadata = {} + return metadata + + +def build_merged_from(models: List[str]) -> str: + def get_title(model: str): + metadata = load_metadata_from_safetensors(model) + title = metadata.get(MODELSPEC_TITLE, None) + if title is None: + title = os.path.splitext(os.path.basename(model))[0] # use filename + return title + + titles = [get_title(model) for model in models] + return ", ".join(titles) + + +# endregion + + +r""" +if __name__ == "__main__": + import argparse + import torch + from safetensors.torch import load_file + from library import train_util + + parser = argparse.ArgumentParser() + parser.add_argument("--ckpt", type=str, required=True) + args = parser.parse_args() + + print(f"Loading {args.ckpt}") + state_dict = load_file(args.ckpt) + + print(f"Calculating metadata") + metadata = get(state_dict, False, False, False, False, "sgm", False, False, "title", "date", 256, 1000, 0) + print(metadata) + del state_dict + + # by reference implementation + with open(args.ckpt, mode="rb") as file_data: + file_hash = hashlib.sha256() + head_len = struct.unpack("Q", file_data.read(8)) # int64 header length prefix + header = json.loads(file_data.read(head_len[0])) # header itself, json string + 
content = ( + file_data.read() + ) # All other content is tightly packed tensors. Copy to RAM for simplicity, but you can avoid this read with a more careful FS-dependent impl. + file_hash.update(content) + # ===== Update the hash for modelspec ===== + by_ref = f"0x{file_hash.hexdigest()}" + print(by_ref) + print("is same?", by_ref == metadata["modelspec.hash_sha256"]) + +""" diff --git a/library/sdxl_model_util.py b/library/sdxl_model_util.py index 07ee3016..807e0aec 100644 --- a/library/sdxl_model_util.py +++ b/library/sdxl_model_util.py @@ -10,10 +10,10 @@ from library import sdxl_original_unet VAE_SCALE_FACTOR = 0.13025 -MODEL_VERSION_SDXL_BASE_V0_9 = "sdxl_base_v0-9" +MODEL_VERSION_SDXL_BASE_V1_0 = "sdxl_base_v1-0" # Diffusersの設定を読み込むための参照モデル -DIFFUSERS_REF_MODEL_ID_SDXL = "stabilityai/stable-diffusion-xl-base-0.9" # アクセス権が必要 +DIFFUSERS_REF_MODEL_ID_SDXL = "stabilityai/stable-diffusion-xl-base-1.0" DIFFUSERS_SDXL_UNET_CONFIG = { "act_fn": "silu", @@ -468,6 +468,7 @@ def save_stable_diffusion_checkpoint( ckpt_info, vae, logit_scale, + metadata, save_dtype=None, ): state_dict = {} @@ -505,7 +506,7 @@ def save_stable_diffusion_checkpoint( new_ckpt["global_step"] = steps if model_util.is_safetensors(output_file): - save_file(state_dict, output_file) + save_file(state_dict, output_file, metadata) else: torch.save(new_ckpt, output_file) diff --git a/library/sdxl_train_util.py b/library/sdxl_train_util.py index 9919df0d..1f849275 100644 --- a/library/sdxl_train_util.py +++ b/library/sdxl_train_util.py @@ -76,7 +76,9 @@ def _load_target_model(name_or_path: str, vae_path: Optional[str], model_version print(f"load Diffusers pretrained models: {name_or_path}, variant={variant}") try: try: - pipe = StableDiffusionXLPipeline.from_pretrained(name_or_path, torch_dtype=weight_dtype, variant=variant, tokenizer=None) + pipe = StableDiffusionXLPipeline.from_pretrained( + name_or_path, torch_dtype=weight_dtype, variant=variant, tokenizer=None + ) except EnvironmentError as ex: if variant is not None: print("try to load fp32 model") @@ -98,7 +100,7 @@ def _load_target_model(name_or_path: str, vae_path: Optional[str], model_version # Diffusers U-Net to original U-Net state_dict = sdxl_model_util.convert_diffusers_unet_state_dict_to_sdxl(unet.state_dict()) with init_empty_weights(): - unet = sdxl_original_unet.SdxlUNet2DConditionModel() # overwrite unet + unet = sdxl_original_unet.SdxlUNet2DConditionModel() # overwrite unet sdxl_model_util._load_state_dict_on_device(unet, state_dict, device=device) print("U-Net converted to original U-Net") @@ -197,6 +199,7 @@ def save_sd_model_on_train_end( ckpt_info, ): def sd_saver(ckpt_file, epoch_no, global_step): + sai_metadata = train_util.get_sai_model_spec(None, args, True, False, False, is_stable_diffusion_ckpt=True) sdxl_model_util.save_stable_diffusion_checkpoint( ckpt_file, text_encoder1, @@ -207,6 +210,7 @@ def save_sd_model_on_train_end( ckpt_info, vae, logit_scale, + sai_metadata, save_dtype, ) @@ -248,6 +252,7 @@ def save_sd_model_on_epoch_end_or_stepwise( ckpt_info, ): def sd_saver(ckpt_file, epoch_no, global_step): + sai_metadata = train_util.get_sai_model_spec(None, args, True, False, False, is_stable_diffusion_ckpt=True) sdxl_model_util.save_stable_diffusion_checkpoint( ckpt_file, text_encoder1, @@ -258,6 +263,7 @@ def save_sd_model_on_epoch_end_or_stepwise( ckpt_info, vae, logit_scale, + sai_metadata, save_dtype, ) diff --git a/library/train_util.py b/library/train_util.py index c5e903bb..dbfe41e8 100644 --- a/library/train_util.py +++ 
b/library/train_util.py
@@ -58,12 +58,11 @@
 from huggingface_hub import hf_hub_download
 import numpy as np
 from PIL import Image
 import cv2
-from einops import rearrange
-from torch import einsum
 import safetensors.torch
 from library.lpw_stable_diffusion import StableDiffusionLongPromptWeightingPipeline
 import library.model_util as model_util
 import library.huggingface_util as huggingface_util
+import library.sai_model_spec as sai_model_spec

 # from library.attention_processors import FlashAttnProcessor
 # from library.hypernetwork import replace_attentions_for_hypernetwork
@@ -2460,6 +2459,106 @@ def replace_vae_attn_to_memory_efficient():
 # region arguments

+def load_metadata_from_safetensors(safetensors_file: str) -> dict:
+    r"""
+    This method locks the file; see https://github.com/huggingface/safetensors/issues/164
+    If the file isn't .safetensors or doesn't have metadata, return an empty dict.
+    """
+    if os.path.splitext(safetensors_file)[1] != ".safetensors":
+        return {}
+
+    with safetensors.safe_open(safetensors_file, framework="pt", device="cpu") as f:
+        metadata = f.metadata()
+    if metadata is None:
+        metadata = {}
+    return metadata
+
+
+# this metadata is referenced by train_network and various scripts, so it is defined here
+SS_METADATA_KEY_V2 = "ss_v2"
+SS_METADATA_KEY_BASE_MODEL_VERSION = "ss_base_model_version"
+SS_METADATA_KEY_NETWORK_MODULE = "ss_network_module"
+SS_METADATA_KEY_NETWORK_DIM = "ss_network_dim"
+SS_METADATA_KEY_NETWORK_ALPHA = "ss_network_alpha"
+SS_METADATA_KEY_NETWORK_ARGS = "ss_network_args"
+
+SS_METADATA_MINIMUM_KEYS = [
+    SS_METADATA_KEY_V2,
+    SS_METADATA_KEY_BASE_MODEL_VERSION,
+    SS_METADATA_KEY_NETWORK_MODULE,
+    SS_METADATA_KEY_NETWORK_DIM,
+    SS_METADATA_KEY_NETWORK_ALPHA,
+    SS_METADATA_KEY_NETWORK_ARGS,
+]
+
+
+def build_minimum_network_metadata(
+    v2: Optional[bool],
+    base_model: Optional[str],
+    network_module: str,
+    network_dim: str,
+    network_alpha: str,
+    network_args: Optional[dict],
+):
+    # old LoRA doesn't have base_model
+    metadata = {
+        SS_METADATA_KEY_NETWORK_MODULE: network_module,
+        SS_METADATA_KEY_NETWORK_DIM: network_dim,
+        SS_METADATA_KEY_NETWORK_ALPHA: network_alpha,
+    }
+    if v2 is not None:
+        metadata[SS_METADATA_KEY_V2] = v2
+    if base_model is not None:
+        metadata[SS_METADATA_KEY_BASE_MODEL_VERSION] = base_model
+    if network_args is not None:
+        metadata[SS_METADATA_KEY_NETWORK_ARGS] = json.dumps(network_args)
+    return metadata
+
+
+def get_sai_model_spec(
+    state_dict: dict,
+    args: argparse.Namespace,
+    sdxl: bool,
+    lora: bool,
+    textual_inversion: bool,
+    is_stable_diffusion_ckpt: Optional[bool] = None,  # None for TI and LoRA
+):
+    timestamp = time.time()
+
+    v2 = args.v2
+    v_parameterization = args.v_parameterization
+    reso = args.resolution
+
+    title = args.metadata_title if args.metadata_title is not None else args.output_name
+
+    if args.min_timestep is not None or args.max_timestep is not None:
+        min_time_step = args.min_timestep if args.min_timestep is not None else 0
+        max_time_step = args.max_timestep if args.max_timestep is not None else 1000
+        timesteps = (min_time_step, max_time_step)
+    else:
+        timesteps = None
+
+    metadata = sai_model_spec.build_metadata(
+        state_dict,
+        v2,
+        v_parameterization,
+        sdxl,
+        lora,
+        textual_inversion,
+        timestamp,
+        title,
+        reso,
+        is_stable_diffusion_ckpt,
+        args.metadata_author,
+        args.metadata_description,
+        args.metadata_license,
+        args.metadata_tags,
+        timesteps,
+        args.clip_skip,  # None or int
+    )
+    return metadata
+
+
 def add_sd_models_arguments(parser: argparse.ArgumentParser):
     # for
pretrained models parser.add_argument("--v2", action="store_true", help="load Stable Diffusion v2.0 model / Stable Diffusion 2.0のモデルを読み込む") @@ -2830,6 +2929,38 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: "--output_config", action="store_true", help="output command line args to given .toml file / 引数を.tomlファイルに出力する" ) + # SAI Model spec + parser.add_argument( + "--metadata_title", + type=str, + default=None, + help="title for model metadata (default is output_name) / メタデータに書き込まれるモデルタイトル、省略時はoutput_name", + ) + parser.add_argument( + "--metadata_author", + type=str, + default=None, + help="author name for model metadata / メタデータに書き込まれるモデル作者名", + ) + parser.add_argument( + "--metadata_description", + type=str, + default=None, + help="description for model metadata / メタデータに書き込まれるモデル説明", + ) + parser.add_argument( + "--metadata_license", + type=str, + default=None, + help="license for model metadata / メタデータに書き込まれるモデルライセンス", + ) + parser.add_argument( + "--metadata_tags", + type=str, + default=None, + help="tags for model metadata, separated by comma / メタデータに書き込まれるモデルタグ、カンマ区切り", + ) + if support_dreambooth: # DreamBooth training parser.add_argument( @@ -3893,8 +4024,9 @@ def save_sd_model_on_epoch_end_or_stepwise( vae, ): def sd_saver(ckpt_file, epoch_no, global_step): + sai_metadata = get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True) model_util.save_stable_diffusion_checkpoint( - args.v2, ckpt_file, text_encoder, unet, src_path, epoch_no, global_step, save_dtype, vae + args.v2, ckpt_file, text_encoder, unet, src_path, epoch_no, global_step, sai_metadata, save_dtype, vae ) def diffusers_saver(out_dir): @@ -4074,8 +4206,9 @@ def save_sd_model_on_train_end( vae, ): def sd_saver(ckpt_file, epoch_no, global_step): + sai_metadata = get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True) model_util.save_stable_diffusion_checkpoint( - args.v2, ckpt_file, text_encoder, unet, src_path, epoch_no, global_step, save_dtype, vae + args.v2, ckpt_file, text_encoder, unet, src_path, epoch_no, global_step, sai_metadata, save_dtype, vae ) def diffusers_saver(out_dir): diff --git a/networks/extract_lora_from_models.py b/networks/extract_lora_from_models.py index 0bc1afe0..b4eb0cf7 100644 --- a/networks/extract_lora_from_models.py +++ b/networks/extract_lora_from_models.py @@ -5,9 +5,11 @@ import argparse import json import os +import time import torch from safetensors.torch import load_file, save_file from tqdm import tqdm +from library import sai_model_spec import library.model_util as model_util import library.sdxl_model_util as sdxl_model_util import lora @@ -59,15 +61,15 @@ def svd(args): else: print(f"loading original SDXL model : {args.model_org}") text_encoder_o1, text_encoder_o2, _, unet_o, _, _ = sdxl_model_util.load_models_from_sdxl_checkpoint( - sdxl_model_util.MODEL_VERSION_SDXL_BASE_V0_9, args.model_org, "cpu" + sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, args.model_org, "cpu" ) text_encoders_o = [text_encoder_o1, text_encoder_o2] print(f"loading original SDXL model : {args.model_tuned}") text_encoder_t1, text_encoder_t2, _, unet_t, _, _ = sdxl_model_util.load_models_from_sdxl_checkpoint( - sdxl_model_util.MODEL_VERSION_SDXL_BASE_V0_9, args.model_tuned, "cpu" + sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, args.model_tuned, "cpu" ) text_encoders_t = [text_encoder_t1, text_encoder_t2] - model_version = sdxl_model_util.MODEL_VERSION_SDXL_BASE_V0_9 + model_version = 
sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0 # create LoRA network to extract weights: Use dim (rank) as alpha if args.conv_dim is None: @@ -197,6 +199,13 @@ def svd(args): "ss_network_args": json.dumps(net_kwargs), } + if not args.no_metadata: + title = os.path.splitext(os.path.basename(args.save_to))[0] + sai_metadata = sai_model_spec.build_metadata( + None, args.v2, args.v_parameterization, False, True, False, time.time(), title=title + ) + metadata.update(sai_metadata) + lora_network_save.save_weights(args.save_to, save_dtype, metadata) print(f"LoRA weights are saved to: {args.save_to}") @@ -243,6 +252,12 @@ def setup_parser() -> argparse.ArgumentParser: help="dimension (rank) of LoRA for Conv2d-3x3 (default None, disabled) / LoRAのConv2d-3x3の次元数(rank)(デフォルトNone、適用なし)", ) parser.add_argument("--device", type=str, default=None, help="device to use, cuda for GPU / 計算を行うデバイス、cuda でGPUを使う") + parser.add_argument( + "--no_metadata", + action="store_true", + help="do not save sai modelspec metadata (minimum ss_metadata for LoRA is saved) / " + + "sai modelspecのメタデータを保存しない(LoRAの最低限のss_metadataは保存される)", + ) return parser diff --git a/networks/merge_lora.py b/networks/merge_lora.py index 2fa8861b..c8d743f5 100644 --- a/networks/merge_lora.py +++ b/networks/merge_lora.py @@ -1,8 +1,10 @@ import math import argparse import os +import time import torch from safetensors.torch import load_file, save_file +from library import sai_model_spec, train_util import library.model_util as model_util import lora @@ -10,22 +12,26 @@ import lora def load_state_dict(file_name, dtype): if os.path.splitext(file_name)[1] == ".safetensors": sd = load_file(file_name) + metadata = train_util.load_metadata_from_safetensors(file_name) else: sd = torch.load(file_name, map_location="cpu") + metadata = {} + for key in list(sd.keys()): if type(sd[key]) == torch.Tensor: sd[key] = sd[key].to(dtype) - return sd + + return sd, metadata -def save_to_file(file_name, model, state_dict, dtype): +def save_to_file(file_name, model, state_dict, dtype, metadata): if dtype is not None: for key in list(state_dict.keys()): if type(state_dict[key]) == torch.Tensor: state_dict[key] = state_dict[key].to(dtype) if os.path.splitext(file_name)[1] == ".safetensors": - save_file(model, file_name) + save_file(model, file_name, metadata=metadata) else: torch.save(model, file_name) @@ -56,7 +62,7 @@ def merge_to_sd_model(text_encoder, unet, models, ratios, merge_dtype): for model, ratio in zip(models, ratios): print(f"loading: {model}") - lora_sd = load_state_dict(model, merge_dtype) + lora_sd, _ = load_state_dict(model, merge_dtype) print(f"merging...") for key in lora_sd.keys(): @@ -81,9 +87,11 @@ def merge_to_sd_model(text_encoder, unet, models, ratios, merge_dtype): # W <- W + U * D weight = module.weight - # print(module_name, down_weight.size(), up_weight.size()) if len(weight.size()) == 2: # linear + if len(up_weight.size()) == 4: # use linear projection mismatch + up_weight = up_weight.squeeze(3).squeeze(2) + down_weight = down_weight.squeeze(3).squeeze(2) weight = weight + ratio * (up_weight @ down_weight) * scale elif down_weight.size()[2:4] == (1, 1): # conv2d 1x1 @@ -107,9 +115,17 @@ def merge_lora_models(models, ratios, merge_dtype): base_dims = {} merged_sd = {} + v2 = None + base_model = None for model, ratio in zip(models, ratios): print(f"loading: {model}") - lora_sd = load_state_dict(model, merge_dtype) + lora_sd, lora_metadata = load_state_dict(model, merge_dtype) + + if lora_metadata is not None: + if v2 is None: + v2 = 
lora_metadata.get(train_util.SS_METADATA_KEY_V2, None) # return string + if base_model is None: + base_model = lora_metadata.get(train_util.SS_METADATA_KEY_BASE_MODEL_VERSION, None) # get alpha and dim alphas = {} # alpha for current model @@ -166,7 +182,26 @@ def merge_lora_models(models, ratios, merge_dtype): print("merged model") print(f"dim: {list(set(base_dims.values()))}, alpha: {list(set(base_alphas.values()))}") - return merged_sd + # check all dims are same + dims_list = list(set(base_dims.values())) + alphas_list = list(set(base_alphas.values())) + all_same_dims = True + all_same_alphas = True + for dims in dims_list: + if dims != dims_list[0]: + all_same_dims = False + break + for alphas in alphas_list: + if alphas != alphas_list[0]: + all_same_alphas = False + break + + # build minimum metadata + dims = f"{dims_list[0]}" if all_same_dims else "Dynamic" + alphas = f"{alphas_list[0]}" if all_same_alphas else "Dynamic" + metadata = train_util.build_minimum_network_metadata(v2, base_model, "networks.lora", dims, alphas, None) + + return merged_sd, metadata, v2 == "True" def merge(args): @@ -193,13 +228,57 @@ def merge(args): merge_to_sd_model(text_encoder, unet, args.models, args.ratios, merge_dtype) + if args.no_metadata: + sai_metadata = None + else: + merged_from = sai_model_spec.build_merged_from([args.sd_model] + args.models) + title = os.path.splitext(os.path.basename(args.save_to))[0] + sai_metadata = sai_model_spec.build_metadata( + None, + args.v2, + args.v2, + False, + False, + False, + time.time(), + title=title, + merged_from=merged_from, + is_stable_diffusion_ckpt=True, + ) + if args.v2: + # TODO read sai modelspec + print( + "Cannot determine if model is for v-prediction, so save metadata as v-prediction / modelがv-prediction用か否か不明なため、仮にv-prediction用としてmetadataを保存します" + ) + print(f"saving SD model to: {args.save_to}") - model_util.save_stable_diffusion_checkpoint(args.v2, args.save_to, text_encoder, unet, args.sd_model, 0, 0, save_dtype, vae) + model_util.save_stable_diffusion_checkpoint( + args.v2, args.save_to, text_encoder, unet, args.sd_model, 0, 0, sai_metadata, save_dtype, vae + ) else: - state_dict = merge_lora_models(args.models, args.ratios, merge_dtype) + state_dict, metadata, v2 = merge_lora_models(args.models, args.ratios, merge_dtype) + + print(f"calculating hashes and creating metadata...") + + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) + metadata["sshs_model_hash"] = model_hash + metadata["sshs_legacy_hash"] = legacy_hash + + if not args.no_metadata: + merged_from = sai_model_spec.build_merged_from(args.models) + title = os.path.splitext(os.path.basename(args.save_to))[0] + sai_metadata = sai_model_spec.build_metadata( + state_dict, v2, v2, False, True, False, time.time(), title=title, merged_from=merged_from + ) + if v2: + # TODO read sai modelspec + print( + "Cannot determine if LoRA is for v-prediction, so save metadata as v-prediction / LoRAがv-prediction用か否か不明なため、仮にv-prediction用としてmetadataを保存します" + ) + metadata.update(sai_metadata) print(f"saving model to: {args.save_to}") - save_to_file(args.save_to, state_dict, state_dict, save_dtype) + save_to_file(args.save_to, state_dict, state_dict, save_dtype, metadata) def setup_parser() -> argparse.ArgumentParser: @@ -232,6 +311,12 @@ def setup_parser() -> argparse.ArgumentParser: "--models", type=str, nargs="*", help="LoRA models to merge: ckpt or safetensors file / マージするLoRAモデル、ckptまたはsafetensors" ) parser.add_argument("--ratios", type=float, nargs="*", 
help="ratios for each model / それぞれのLoRAモデルの比率") + parser.add_argument( + "--no_metadata", + action="store_true", + help="do not save sai modelspec metadata (minimum ss_metadata for LoRA is saved) / " + + "sai modelspecのメタデータを保存しない(LoRAの最低限のss_metadataは保存される)", + ) return parser diff --git a/networks/sdxl_merge_lora.py b/networks/sdxl_merge_lora.py index a91b62d8..0608c01f 100644 --- a/networks/sdxl_merge_lora.py +++ b/networks/sdxl_merge_lora.py @@ -1,10 +1,11 @@ import math import argparse import os +import time import torch from safetensors.torch import load_file, save_file from tqdm import tqdm -from library import sdxl_model_util +from library import sai_model_spec, sdxl_model_util, train_util import library.model_util as model_util import lora @@ -12,22 +13,26 @@ import lora def load_state_dict(file_name, dtype): if os.path.splitext(file_name)[1] == ".safetensors": sd = load_file(file_name) + metadata = train_util.load_metadata_from_safetensors(file_name) else: sd = torch.load(file_name, map_location="cpu") + metadata = {} + for key in list(sd.keys()): if type(sd[key]) == torch.Tensor: sd[key] = sd[key].to(dtype) - return sd + + return sd, metadata -def save_to_file(file_name, model, state_dict, dtype): +def save_to_file(file_name, model, state_dict, dtype, metadata): if dtype is not None: for key in list(state_dict.keys()): if type(state_dict[key]) == torch.Tensor: state_dict[key] = state_dict[key].to(dtype) if os.path.splitext(file_name)[1] == ".safetensors": - save_file(model, file_name) + save_file(model, file_name, metadata=metadata) else: torch.save(model, file_name) @@ -62,7 +67,7 @@ def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_ for model, ratio in zip(models, ratios): print(f"loading: {model}") - lora_sd = load_state_dict(model, merge_dtype) + lora_sd, _ = load_state_dict(model, merge_dtype) print(f"merging...") for key in tqdm(lora_sd.keys()): @@ -113,9 +118,17 @@ def merge_lora_models(models, ratios, merge_dtype): base_dims = {} merged_sd = {} + v2 = None + base_model = None for model, ratio in zip(models, ratios): print(f"loading: {model}") - lora_sd = load_state_dict(model, merge_dtype) + lora_sd, lora_metadata = load_state_dict(model, merge_dtype) + + if lora_metadata is not None: + if v2 is None: + v2 = lora_metadata.get(train_util.SS_METADATA_KEY_V2, None) # returns string, SDXLはv2がないのでFalseのはず + if base_model is None: + base_model = lora_metadata.get(train_util.SS_METADATA_KEY_BASE_MODEL_VERSION, None) # get alpha and dim alphas = {} # alpha for current model @@ -172,7 +185,26 @@ def merge_lora_models(models, ratios, merge_dtype): print("merged model") print(f"dim: {list(set(base_dims.values()))}, alpha: {list(set(base_alphas.values()))}") - return merged_sd + # check all dims are same + dims_list = list(set(base_dims.values())) + alphas_list = list(set(base_alphas.values())) + all_same_dims = True + all_same_alphas = True + for dims in dims_list: + if dims != dims_list[0]: + all_same_dims = False + break + for alphas in alphas_list: + if alphas != alphas_list[0]: + all_same_alphas = False + break + + # build minimum metadata + dims = f"{dims_list[0]}" if all_same_dims else "Dynamic" + alphas = f"{alphas_list[0]}" if all_same_alphas else "Dynamic" + metadata = train_util.build_minimum_network_metadata(v2, base_model, "networks.lora", dims, alphas, None) + + return merged_sd, metadata def merge(args): @@ -202,19 +234,42 @@ def merge(args): unet, logit_scale, ckpt_info, - ) = 
sdxl_model_util.load_models_from_sdxl_checkpoint(sdxl_model_util.MODEL_VERSION_SDXL_BASE_V0_9, args.sd_model, "cpu") + ) = sdxl_model_util.load_models_from_sdxl_checkpoint(sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, args.sd_model, "cpu") merge_to_sd_model(text_model1, text_model2, unet, args.models, args.ratios, merge_dtype) + if args.no_metadata: + sai_metadata = None + else: + merged_from = sai_model_spec.build_merged_from([args.sd_model] + args.models) + title = os.path.splitext(os.path.basename(args.save_to))[0] + sai_metadata = sai_model_spec.build_metadata( + None, False, False, True, False, False, time.time(), title=title, merged_from=merged_from + ) + print(f"saving SD model to: {args.save_to}") sdxl_model_util.save_stable_diffusion_checkpoint( - args.save_to, text_model1, text_model2, unet, 0, 0, ckpt_info, vae, logit_scale, save_dtype + args.save_to, text_model1, text_model2, unet, 0, 0, ckpt_info, vae, logit_scale, sai_metadata, save_dtype ) else: - state_dict = merge_lora_models(args.models, args.ratios, merge_dtype) + state_dict, metadata = merge_lora_models(args.models, args.ratios, merge_dtype) + + print(f"calculating hashes and creating metadata...") + + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) + metadata["sshs_model_hash"] = model_hash + metadata["sshs_legacy_hash"] = legacy_hash + + if not args.no_metadata: + merged_from = sai_model_spec.build_merged_from(args.models) + title = os.path.splitext(os.path.basename(args.save_to))[0] + sai_metadata = sai_model_spec.build_metadata( + state_dict, False, False, True, True, False, time.time(), title=title, merged_from=merged_from + ) + metadata.update(sai_metadata) print(f"saving model to: {args.save_to}") - save_to_file(args.save_to, state_dict, state_dict, save_dtype) + save_to_file(args.save_to, state_dict, state_dict, save_dtype, metadata) def setup_parser() -> argparse.ArgumentParser: @@ -246,6 +301,12 @@ def setup_parser() -> argparse.ArgumentParser: "--models", type=str, nargs="*", help="LoRA models to merge: ckpt or safetensors file / マージするLoRAモデル、ckptまたはsafetensors" ) parser.add_argument("--ratios", type=float, nargs="*", help="ratios for each model / それぞれのLoRAモデルの比率") + parser.add_argument( + "--no_metadata", + action="store_true", + help="do not save sai modelspec metadata (minimum ss_metadata for LoRA is saved) / " + + "sai modelspecのメタデータを保存しない(LoRAの最低限のss_metadataは保存される)", + ) return parser diff --git a/networks/svd_merge_lora.py b/networks/svd_merge_lora.py index 24359aa5..16e813b3 100644 --- a/networks/svd_merge_lora.py +++ b/networks/svd_merge_lora.py @@ -1,9 +1,11 @@ import math import argparse import os +import time import torch from safetensors.torch import load_file, save_file from tqdm import tqdm +from library import sai_model_spec, train_util import library.model_util as model_util import lora @@ -14,22 +16,26 @@ CLAMP_QUANTILE = 0.99 def load_state_dict(file_name, dtype): if os.path.splitext(file_name)[1] == ".safetensors": sd = load_file(file_name) + metadata = train_util.load_metadata_from_safetensors(file_name) else: sd = torch.load(file_name, map_location="cpu") + metadata = {} + for key in list(sd.keys()): if type(sd[key]) == torch.Tensor: sd[key] = sd[key].to(dtype) - return sd + + return sd, metadata -def save_to_file(file_name, state_dict, dtype): +def save_to_file(file_name, state_dict, dtype, metadata): if dtype is not None: for key in list(state_dict.keys()): if type(state_dict[key]) == torch.Tensor: state_dict[key] = state_dict[key].to(dtype) 
if os.path.splitext(file_name)[1] == ".safetensors": - save_file(state_dict, file_name) + save_file(state_dict, file_name, metadata=metadata) else: torch.save(state_dict, file_name) @@ -37,9 +43,17 @@ def save_to_file(file_name, state_dict, dtype): def merge_lora_models(models, ratios, new_rank, new_conv_rank, device, merge_dtype): print(f"new rank: {new_rank}, new conv rank: {new_conv_rank}") merged_sd = {} + v2 = None + base_model = None for model, ratio in zip(models, ratios): print(f"loading: {model}") - lora_sd = load_state_dict(model, merge_dtype) + lora_sd, lora_metadata = load_state_dict(model, merge_dtype) + + if lora_metadata is not None: + if v2 is None: + v2 = lora_metadata.get(train_util.SS_METADATA_KEY_V2, None) # return string + if base_model is None: + base_model = lora_metadata.get(train_util.SS_METADATA_KEY_BASE_MODEL_VERSION, None) # merge print(f"merging...") @@ -140,7 +154,16 @@ def merge_lora_models(models, ratios, new_rank, new_conv_rank, device, merge_dty merged_lora_sd[lora_module_name + ".lora_down.weight"] = down_weight.to("cpu").contiguous() merged_lora_sd[lora_module_name + ".alpha"] = torch.tensor(module_new_rank) - return merged_lora_sd + # build minimum metadata + dims = f"{new_rank}" + alphas = f"{new_rank}" + if new_conv_rank is not None: + network_args = {"conv_dim": new_conv_rank, "conv_alpha": new_conv_rank} + else: + network_args = None + metadata = train_util.build_minimum_network_metadata(v2, base_model, "networks.lora", dims, alphas, network_args) + + return merged_lora_sd, metadata, v2 == "True", base_model def merge(args): @@ -161,10 +184,32 @@ def merge(args): save_dtype = merge_dtype new_conv_rank = args.new_conv_rank if args.new_conv_rank is not None else args.new_rank - state_dict = merge_lora_models(args.models, args.ratios, args.new_rank, new_conv_rank, args.device, merge_dtype) + state_dict, metadata, v2, base_model = merge_lora_models( + args.models, args.ratios, args.new_rank, new_conv_rank, args.device, merge_dtype + ) + + print(f"calculating hashes and creating metadata...") + + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) + metadata["sshs_model_hash"] = model_hash + metadata["sshs_legacy_hash"] = legacy_hash + + if not args.no_metadata: + is_sdxl = base_model is not None and base_model.lower().startswith("sdxl") + merged_from = sai_model_spec.build_merged_from(args.models) + title = os.path.splitext(os.path.basename(args.save_to))[0] + sai_metadata = sai_model_spec.build_metadata( + state_dict, v2, v2, is_sdxl, True, False, time.time(), title=title, merged_from=merged_from + ) + if v2: + # TODO read sai modelspec + print( + "Cannot determine if LoRA is for v-prediction, so save metadata as v-prediction / LoRAがv-prediction用か否か不明なため、仮にv-prediction用としてmetadataを保存します" + ) + metadata.update(sai_metadata) print(f"saving model to: {args.save_to}") - save_to_file(args.save_to, state_dict, save_dtype) + save_to_file(args.save_to, state_dict, save_dtype, metadata) def setup_parser() -> argparse.ArgumentParser: @@ -198,6 +243,12 @@ def setup_parser() -> argparse.ArgumentParser: help="Specify rank of output LoRA for Conv2d 3x3, None for same as new_rank / 出力するConv2D 3x3 LoRAのrank (dim)、Noneでnew_rankと同じ", ) parser.add_argument("--device", type=str, default=None, help="device to use, cuda for GPU / 計算を行うデバイス、cuda でGPUを使う") + parser.add_argument( + "--no_metadata", + action="store_true", + help="do not save sai modelspec metadata (minimum ss_metadata for LoRA is saved) / " + + "sai 
modelspecのメタデータを保存しない(LoRAの最低限のss_metadataは保存される)", + ) return parser diff --git a/sdxl_gen_img.py b/sdxl_gen_img.py index 68f0e5db..209e71a7 100644 --- a/sdxl_gen_img.py +++ b/sdxl_gen_img.py @@ -1294,7 +1294,7 @@ def main(args): args.ckpt = files[0] (_, text_encoder1, text_encoder2, vae, unet, _, _) = sdxl_train_util._load_target_model( - args.ckpt, args.vae, sdxl_model_util.MODEL_VERSION_SDXL_BASE_V0_9, dtype + args.ckpt, args.vae, sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, dtype ) # xformers、Hypernetwork対応 diff --git a/sdxl_minimal_inference.py b/sdxl_minimal_inference.py index 72ffe97f..5c8a0bd8 100644 --- a/sdxl_minimal_inference.py +++ b/sdxl_minimal_inference.py @@ -112,7 +112,7 @@ if __name__ == "__main__": # 本体RAMが少ない場合はGPUにロードするといいかも # If the main RAM is small, it may be better to load it on the GPU text_model1, text_model2, vae, unet, _, _ = sdxl_model_util.load_models_from_sdxl_checkpoint( - sdxl_model_util.MODEL_VERSION_SDXL_BASE_V0_9, args.ckpt_path, "cpu" + sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, args.ckpt_path, "cpu" ) # Text Encoder 1はSDXL本体でもHuggingFaceのものを使っている diff --git a/sdxl_train.py b/sdxl_train.py index b57e2f5c..2ca14931 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -151,7 +151,7 @@ def train(args): else: save_stable_diffusion_format = args.save_model_as.lower() == "ckpt" or args.save_model_as.lower() == "safetensors" use_safetensors = args.use_safetensors or ("safetensors" in args.save_model_as.lower()) - assert save_stable_diffusion_format, "save_model_as must be ckpt or safetensors / save_model_asはckptかsafetensorsである必要があります" + # assert save_stable_diffusion_format, "save_model_as must be ckpt or safetensors / save_model_asはckptかsafetensorsである必要があります" # Diffusers版のxformers使用フラグを設定する関数 def set_diffusers_xformers_flag(model, valid): diff --git a/sdxl_train_network.py b/sdxl_train_network.py index dc222534..e3254be0 100644 --- a/sdxl_train_network.py +++ b/sdxl_train_network.py @@ -8,6 +8,7 @@ class SdxlNetworkTrainer(train_network.NetworkTrainer): def __init__(self): super().__init__() self.vae_scale_factor = sdxl_model_util.VAE_SCALE_FACTOR + self.is_sdxl = True def assert_extra_args(self, args, train_dataset_group): super().assert_extra_args(args, train_dataset_group) @@ -31,13 +32,13 @@ class SdxlNetworkTrainer(train_network.NetworkTrainer): unet, logit_scale, ckpt_info, - ) = sdxl_train_util.load_target_model(args, accelerator, sdxl_model_util.MODEL_VERSION_SDXL_BASE_V0_9, weight_dtype) + ) = sdxl_train_util.load_target_model(args, accelerator, sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, weight_dtype) self.load_stable_diffusion_format = load_stable_diffusion_format self.logit_scale = logit_scale self.ckpt_info = ckpt_info - return sdxl_model_util.MODEL_VERSION_SDXL_BASE_V0_9, [text_encoder1, text_encoder2], vae, unet + return sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, [text_encoder1, text_encoder2], vae, unet def load_tokenizer(self, args): tokenizer = sdxl_train_util.load_tokenizers(args) @@ -134,7 +135,6 @@ class SdxlNetworkTrainer(train_network.NetworkTrainer): # assert ((pool2.to("cpu") - p2.to(dtype=weight_dtype)).abs().max() > 1e-2).sum() <= b_size * 2 # print("text encoder outputs verified") - return encoder_hidden_states1, encoder_hidden_states2, pool2 def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_conds, batch, weight_dtype): diff --git a/sdxl_train_textual_inversion.py b/sdxl_train_textual_inversion.py index a2515051..1ddfd92b 100644 --- a/sdxl_train_textual_inversion.py +++ 
b/sdxl_train_textual_inversion.py
@@ -13,6 +13,7 @@ class SdxlTextualInversionTrainer(train_textual_inversion.TextualInversionTraine
     def __init__(self):
         super().__init__()
         self.vae_scale_factor = sdxl_model_util.VAE_SCALE_FACTOR
+        self.is_sdxl = True

     def assert_extra_args(self, args, train_dataset_group):
         super().assert_extra_args(args, train_dataset_group)
@@ -27,13 +28,13 @@
             unet,
             logit_scale,
             ckpt_info,
-        ) = sdxl_train_util.load_target_model(args, accelerator, sdxl_model_util.MODEL_VERSION_SDXL_BASE_V0_9, weight_dtype)
+        ) = sdxl_train_util.load_target_model(args, accelerator, sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, weight_dtype)

         self.load_stable_diffusion_format = load_stable_diffusion_format
         self.logit_scale = logit_scale
         self.ckpt_info = ckpt_info

-        return sdxl_model_util.MODEL_VERSION_SDXL_BASE_V0_9, [text_encoder1, text_encoder2], vae, unet
+        return sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, [text_encoder1, text_encoder2], vae, unet

     def load_tokenizer(self, args):
         tokenizer = sdxl_train_util.load_tokenizers(args)
@@ -79,7 +80,7 @@
             accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet, prompt_replacement
         )

-    def save_weights(self, file, updated_embs, save_dtype):
+    def save_weights(self, file, updated_embs, save_dtype, metadata):
         state_dict = {"clip_l": updated_embs[0], "clip_g": updated_embs[1]}

         if save_dtype is not None:
@@ -91,7 +92,7 @@
         if os.path.splitext(file)[1] == ".safetensors":
             from safetensors.torch import save_file

-            save_file(state_dict, file)
+            save_file(state_dict, file, metadata)
         else:
             torch.save(state_dict, file)
diff --git a/tools/show_metadata.py b/tools/show_metadata.py
new file mode 100644
index 00000000..92ca7b1c
--- /dev/null
+++ b/tools/show_metadata.py
@@ -0,0 +1,19 @@
+import json
+import argparse
+from safetensors import safe_open
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--model", type=str, required=True)
+args = parser.parse_args()
+
+with safe_open(args.model, framework="pt") as f:
+    metadata = f.metadata()
+
+if metadata is None:
+    print("No metadata found")
+else:
+    # metadata is a json dict, but not pretty printed
+    # sort by key and pretty print
+    print(json.dumps(metadata, indent=4, sort_keys=True))
+
+
\ No newline at end of file
diff --git a/train_db.py b/train_db.py
index 72d634b8..6dde7e9b 100644
--- a/train_db.py
+++ b/train_db.py
@@ -156,10 +156,11 @@ def train(args):
     # 学習に必要なクラスを準備する
     accelerator.print("prepare optimizer, data loader etc.")
     if train_text_encoder:
-        trainable_params = itertools.chain(unet.parameters(), text_encoder.parameters())
+        # without wrapping in a list, AdamW8bit crashes
+        trainable_params = list(itertools.chain(unet.parameters(), text_encoder.parameters()))
     else:
         trainable_params = unet.parameters()
-
+
     _, _, optimizer = train_util.get_optimizer(args, trainable_params)

     # dataloaderを準備する
diff --git a/train_network.py b/train_network.py
index e296d72b..f752607e 100644
--- a/train_network.py
+++ b/train_network.py
@@ -39,6 +39,7 @@ from library.custom_train_functions import (
 class NetworkTrainer:
     def __init__(self):
         self.vae_scale_factor = 0.18215
+        self.is_sdxl = False

     # TODO 他のスクリプトと共通化する
     def generate_step_logs(
@@ -217,7 +218,7 @@
         # モデルに xformers とか memory efficient attention を組み込む
train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers, args.sdpa) - if torch.__version__ >= "2.0.0": # PyTorch 2.0.0 以上対応のxformersなら以下が使える + if torch.__version__ >= "2.0.0": # PyTorch 2.0.0 以上対応のxformersなら以下が使える vae.set_use_memory_efficient_attention_xformers(args.xformers) # 差分追加学習のためにモデルを読み込む @@ -401,7 +402,7 @@ class NetworkTrainer: ) text_encoders = [text_encoder] - unet.to(accelerator.device, dtype=weight_dtype) # move to device because unet is not prepared by accelerator + unet.to(accelerator.device, dtype=weight_dtype) # move to device because unet is not prepared by accelerator else: network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( network, optimizer, train_dataloader, lr_scheduler @@ -660,16 +661,8 @@ class NetworkTrainer: metadata = {k: str(v) for k, v in metadata.items()} # make minimum metadata for filtering - minimum_keys = [ - "ss_v2", - "ss_base_model_version", - "ss_network_module", - "ss_network_dim", - "ss_network_alpha", - "ss_network_args", - ] minimum_metadata = {} - for key in minimum_keys: + for key in train_util.SS_METADATA_MINIMUM_KEYS: if key in metadata: minimum_metadata[key] = metadata[key] @@ -687,7 +680,9 @@ class NetworkTrainer: init_kwargs = {} if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) - accelerator.init_trackers("network_train" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs) + accelerator.init_trackers( + "network_train" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs + ) loss_list = [] loss_total = 0.0 @@ -709,7 +704,11 @@ class NetworkTrainer: metadata["ss_steps"] = str(steps) metadata["ss_epoch"] = str(epoch_no) - unwrapped_nw.save_weights(ckpt_file, save_dtype, minimum_metadata if args.no_metadata else metadata) + metadata_to_save = minimum_metadata if args.no_metadata else metadata + sai_metadata = train_util.get_sai_model_spec(None, args, self.is_sdxl, True, False) + metadata_to_save.update(sai_metadata) + + unwrapped_nw.save_weights(ckpt_file, save_dtype, metadata_to_save) if args.huggingface_repo_id is not None: huggingface_util.upload(args, ckpt_file, "/" + ckpt_name, force_sync_upload=force_sync_upload) diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 300afa3e..b65d524c 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -83,6 +83,7 @@ imagenet_style_templates_small = [ class TextualInversionTrainer: def __init__(self): self.vae_scale_factor = 0.18215 + self.is_sdxl = False def assert_extra_args(self, args, train_dataset_group): pass @@ -113,7 +114,7 @@ class TextualInversionTrainer: accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet, prompt_replacement ) - def save_weights(self, file, updated_embs, save_dtype): + def save_weights(self, file, updated_embs, save_dtype, metadata): state_dict = {"emb_params": updated_embs[0]} if save_dtype is not None: @@ -125,7 +126,7 @@ class TextualInversionTrainer: if os.path.splitext(file)[1] == ".safetensors": from safetensors.torch import save_file - save_file(state_dict, file) + save_file(state_dict, file, metadata) else: torch.save(state_dict, file) # can be loaded in Web UI @@ -345,7 +346,7 @@ class TextualInversionTrainer: # モデルに xformers とか memory efficient attention を組み込む train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers, args.sdpa) - if torch.__version__ >= "2.0.0": # PyTorch 2.0.0 以上対応のxformersなら以下が使える + if torch.__version__ >= "2.0.0": # 
PyTorch 2.0.0 以上対応のxformersなら以下が使える vae.set_use_memory_efficient_attention_xformers(args.xformers) # 学習を準備する @@ -497,7 +498,9 @@ class TextualInversionTrainer: init_kwargs = {} if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) - accelerator.init_trackers("textual_inversion" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs) + accelerator.init_trackers( + "textual_inversion" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs + ) # function for saving/removing def save_model(ckpt_name, embs_list, steps, epoch_no, force_sync_upload=False): @@ -505,7 +508,10 @@ class TextualInversionTrainer: ckpt_file = os.path.join(args.output_dir, ckpt_name) accelerator.print(f"\nsaving checkpoint: {ckpt_file}") - self.save_weights(ckpt_file, embs_list, save_dtype) + + sai_metadata = train_util.get_sai_model_spec(None, args, self.is_sdxl, False, True) + + self.save_weights(ckpt_file, embs_list, save_dtype, sai_metadata) if args.huggingface_repo_id is not None: huggingface_util.upload(args, ckpt_file, "/" + ckpt_name, force_sync_upload=force_sync_upload)
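Taken together, the training and merging scripts now build SAI Model Spec metadata via `library/sai_model_spec.build_metadata` and hand it to the savers. Below is a minimal usage sketch based on the function as added in this diff; the title and resolution are illustrative, and passing `None` for `state_dict` skips the not-yet-supported `hash_sha256` calculation.

```python
# Sketch: build SAI Model Spec metadata for an SDXL LoRA, mirroring what
# train_util.get_sai_model_spec does internally (values are examples).
import time

from library import sai_model_spec

metadata = sai_model_spec.build_metadata(
    None,         # state_dict: None skips the hash calculation
    False,        # v2
    False,        # v_parameterization
    True,         # sdxl
    True,         # lora
    False,        # textual_inversion
    time.time(),  # timestamp, becomes modelspec.date
    title="my-example-lora",
    reso=1024,
)

print(metadata["modelspec.architecture"])    # stable-diffusion-xl-v1-base/lora
print(metadata["modelspec.resolution"])      # 1024x1024
print(metadata["modelspec.implementation"])  # https://github.com/Stability-AI/generative-models
```

The same dictionary is what `tools/show_metadata.py` prints back from a saved `.safetensors` file.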