fix full_fp16 compatibility and train_step

This commit is contained in:
BootsofLagrangian
2024-02-07 16:42:05 +09:00
parent 7d2a9268b9
commit 62556619bd
6 changed files with 121 additions and 40 deletions

View File

@@ -221,18 +221,10 @@ def train(args):
# 学習ステップ数を計算する
if args.max_train_epochs is not None:
if args.deepspeed:
args.max_train_steps = args.max_train_epochs * math.ceil(
len(train_dataloader) / args.gradient_accumulation_steps
)
accelerator.print(
f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}"
)
else:
args.max_train_steps = args.max_train_epochs * math.ceil(
len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps
)
accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}")
args.max_train_steps = args.max_train_epochs * math.ceil(
len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps
)
accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}")
# データセット側にも学習ステップを送信
train_dataset_group.set_max_train_steps(args.max_train_steps)

View File

@@ -3166,6 +3166,11 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth:
action="store_true",
help="Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3."
)
parser.add_argument(
"--fp16_master_weights_and_gradients",
action="store_true",
help="fp16_master_weights_and_gradients requires the optimizer to support keeping fp16 master weights and gradients while keeping the optimizer states in fp32."
)
def verify_training_args(args: argparse.Namespace):
if args.v_parameterization and not args.v2:
@@ -3966,6 +3971,8 @@ def prepare_accelerator(args: argparse.Namespace):
deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size
deepspeed_plugin.deepspeed_config['train_batch_size'] = \
args.train_batch_size * args.gradient_accumulation_steps * int(os.environ['WORLD_SIZE'])
if args.full_fp16 or args.fp16_master_weights_and_gradients:
deepspeed_plugin.deepspeed_config['fp16_master_weights_and_gradients'] = True
accelerator = Accelerator(
gradient_accumulation_steps=args.gradient_accumulation_steps,

View File

@@ -437,7 +437,8 @@ def train(args):
text_encoder2.to(accelerator.device)
# 実験的機能勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする
if args.full_fp16:
if args.full_fp16 and not args.deepspeed:
# During DeepSpeed training, Accelerate does not handle fp16/bf16 mixed precision directly via the grad scaler; the DeepSpeed engine handles it.
train_util.patch_accelerator_for_fp16_training(accelerator)
# resumeする

96
test_pip_requirements.txt Normal file
View File

@@ -0,0 +1,96 @@
absl-py==2.1.0
accelerate==0.25.0
aiohttp==3.9.3
aiosignal==1.3.1
altair==4.2.2
annotated-types @ file:///home/conda/feedstock_root/build_artifacts/annotated-types_1696634205638/work
async-timeout==4.0.3
attrs==23.2.0
bitsandbytes==0.42.0
Brotli @ file:///home/conda/feedstock_root/build_artifacts/brotli-split_1695989787169/work
cachetools==5.3.2
certifi==2022.12.7
charset-normalizer==2.1.1
cmake==3.25.0
deepspeed==0.13.1
diffusers==0.25.0
easygui==0.98.3
einops==0.6.1
entrypoints==0.4
filelock==3.9.0
frozenlist==1.4.1
fsspec==2024.2.0
ftfy==6.1.1
gmpy2 @ file:///home/conda/feedstock_root/build_artifacts/gmpy2_1666808654411/work
google-auth==2.27.0
google-auth-oauthlib==0.4.6
grpcio==1.60.1
hjson==3.1.0
huggingface-hub==0.20.1
idna==3.4
importlib-metadata==7.0.1
Jinja2==3.1.2
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
-e git+https://github.com/kohya-ss/sd-scripts@cd19df49cd512e13ac90db115c424d19c0e8868a#egg=library
lightning-utilities==0.10.1
lit==15.0.7
Markdown==3.5.2
MarkupSafe==2.1.3
mpmath==1.3.0
multidict==6.0.5
networkx==3.2.1
ninja==1.11.1.1
numpy==1.26.3
oauthlib==3.2.2
open-clip-torch==2.20.0
opencv-python==4.7.0.68
packaging==23.2
pandas==2.2.0
pillow==10.2.0
protobuf==3.19.6
psutil==5.9.8
py-cpuinfo @ file:///home/conda/feedstock_root/build_artifacts/py-cpuinfo_1666774466606/work
pyasn1==0.5.1
pyasn1-modules==0.3.0
pydantic @ file:///home/conda/feedstock_root/build_artifacts/pydantic_1706543943340/work
pydantic_core @ file:///home/conda/feedstock_root/build_artifacts/pydantic-core_1705674688239/work
pynvml==11.5.0
PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1661604839144/work
python-dateutil==2.8.2
pytorch-lightning==1.9.0
pytz==2024.1
PyYAML @ file:///home/conda/feedstock_root/build_artifacts/pyyaml_1695373428874/work
referencing==0.33.0
regex==2023.12.25
requests==2.28.1
requests-oauthlib==1.3.1
rpds-py==0.17.1
rsa==4.9
safetensors==0.4.2
scipy==1.12.0
sentencepiece==0.1.99
six==1.16.0
sympy==1.12
tensorboard==2.10.1
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
timm==0.9.12
tokenizers==0.15.1
toml==0.10.2
toolz==0.12.1
torch==2.0.1+cu118
torchaudio==2.2.0
torchmetrics==1.3.0.post0
torchvision==0.15.2+cu118
tqdm==4.66.1
transformers==4.36.2
triton==2.0.0
typing_extensions==4.8.0
tzdata==2023.4
urllib3==1.26.13
voluptuous==0.13.1
wcwidth==0.2.13
Werkzeug==3.0.1
yarl==1.9.4
zipp==3.17.0

View File

@@ -190,18 +190,10 @@ def train(args):
# 学習ステップ数を計算する
if args.max_train_epochs is not None:
if args.deepspeed:
args.max_train_steps = args.max_train_epochs * math.ceil(
len(train_dataloader) / args.gradient_accumulation_steps
)
accelerator.print(
f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}"
)
else:
args.max_train_steps = args.max_train_epochs * math.ceil(
len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps
)
accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}")
args.max_train_steps = args.max_train_epochs * math.ceil(
len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps
)
accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}")
# データセット側にも学習ステップを送信
train_dataset_group.set_max_train_steps(args.max_train_steps)

View File

@@ -359,20 +359,12 @@ class NetworkTrainer:
# 学習ステップ数を計算する
if args.max_train_epochs is not None:
if args.deepspeed:
args.max_train_steps = args.max_train_epochs * math.ceil(
len(train_dataloader) / args.gradient_accumulation_steps
)
accelerator.print(
f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}"
)
else:
args.max_train_steps = args.max_train_epochs * math.ceil(
len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps
)
accelerator.print(
f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}"
)
args.max_train_steps = args.max_train_epochs * math.ceil(
len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps
)
accelerator.print(
f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}"
)
# データセット側にも学習ステップを送信
train_dataset_group.set_max_train_steps(args.max_train_steps)
@@ -479,7 +471,8 @@ class NetworkTrainer:
vae.to(accelerator.device, dtype=vae_dtype)
# 実験的機能勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする
if args.full_fp16:
if args.full_fp16 and not args.deepspeed:
# During DeepSpeed training, Accelerate does not handle fp16/bf16 mixed precision directly via the grad scaler; the DeepSpeed engine handles it.
train_util.patch_accelerator_for_fp16_training(accelerator)
# resumeする