♻️ crazy refactor

This commit is contained in:
2026-03-11 22:52:01 +01:00
parent 35223b3560
commit 4115447022
34 changed files with 4255 additions and 102 deletions

View File

@@ -12,5 +12,23 @@ entropy_loss_scale: 0.05
log_interval: 1000
checkpoint_interval: 50000
initial_log_std: 0.5
min_log_std: -2.0
max_log_std: 2.0
record_video_every: 10000
# ClearML remote execution (GPU worker)
remote: false
# ── HPO search ranges ────────────────────────────────────────────────
# Read by scripts/hpo.py — ignored by TrainerConfig during training.
hpo:
learning_rate: {min: 0.00005, max: 0.001}
clip_ratio: {min: 0.1, max: 0.3}
discount_factor: {min: 0.98, max: 0.999}
gae_lambda: {min: 0.9, max: 0.99}
entropy_loss_scale: {min: 0.0001, max: 0.1}
value_loss_scale: {min: 0.1, max: 1.0}
learning_epochs: {min: 2, max: 8, type: int}
mini_batches: {values: [2, 4, 8, 16]}

View File

@@ -1,22 +1,18 @@
# PPO tuned for MJX (1024+ parallel envs on GPU).
# Inherits defaults + HPO ranges from ppo.yaml.
# With 1024 envs, each timestep collects 1024 samples, so total_timesteps
# can be much lower than the CPU config.
hidden_sizes: [128, 128]
total_timesteps: 300000 # 300K × 1024 envs ≈ 307M env steps
rollout_steps: 1024 # PPO batch = 1024 envs × 1024 steps = 1M samples
learning_epochs: 4
mini_batches: 32 # keep mini-batch size similar to CPU config (~32K)
discount_factor: 0.99
gae_lambda: 0.95
learning_rate: 0.001 # ~3x higher LR for 16x larger batch (sqrt scaling)
clip_ratio: 0.2
value_loss_scale: 0.5
entropy_loss_scale: 0.05
log_interval: 100 # log more often (shorter run)
defaults:
- ppo
- _self_
total_timesteps: 300000 # 300K × 1024 envs ≈ 307M env steps
mini_batches: 32 # keep mini-batch size similar (~32K)
learning_rate: 0.001 # ~3x higher LR for 16x larger batch (roughly sqrt scaling; exact sqrt of 16x would be 4x)
log_interval: 100
checkpoint_interval: 10000
record_video_every: 10000
# ClearML remote execution (GPU worker)
remote: false

View File

@@ -0,0 +1,27 @@
# PPO tuned for single-env real-time training on real hardware.
# Inherits defaults + HPO ranges from ppo.yaml.
# ~50 Hz control × 1 env = ~50 timesteps/s.
# 100k timesteps ≈ 33 minutes of wall-clock training.
defaults:
- ppo
- _self_
hidden_sizes: [256, 256]
total_timesteps: 100000
learning_epochs: 5
learning_rate: 0.001 # conservative — can't undo real-world damage
entropy_loss_scale: 0.0001
log_interval: 1024
checkpoint_interval: 5000 # frequent saves — can't rewind real hardware
initial_log_std: -0.5 # moderate initial exploration
min_log_std: -4.0
max_log_std: 0.0 # cap σ at 1.0
# Never run real-hardware training remotely
remote: false
# Tighter HPO ranges for real hardware (override base ppo.yaml ranges)
hpo:
entropy_loss_scale: {min: 0.00005, max: 0.001}
learning_rate: {min: 0.0003, max: 0.003}

View File

@@ -0,0 +1,23 @@
# PPO tuned for single-env simulation — mimics real hardware training.
# Inherits defaults + HPO ranges from ppo.yaml.
# Same 50 Hz control (runner=mujoco_single), 1 env, conservative hyperparameters.
# Sim runs ~100× faster than real time, so we can afford more timesteps.
defaults:
- ppo
- _self_
hidden_sizes: [256, 256]
total_timesteps: 500000
learning_epochs: 5
learning_rate: 0.001
entropy_loss_scale: 0.0001
log_interval: 1024
checkpoint_interval: 10000
initial_log_std: -0.5
min_log_std: -4.0
max_log_std: 0.0
record_video_every: 50000
remote: false