♻️ crazy refactor

2026-03-11 22:52:01 +01:00
parent 35223b3560
commit 4115447022
34 changed files with 4255 additions and 102 deletions
--- a/configs/env/rotary_cartpole.yaml
+++ b/configs/env/rotary_cartpole.yaml
@@ -1,3 +1,10 @@
 max_steps: 1000
 robot_path: assets/rotary_cartpole
-reward_upright_scale: 1.0
+reward_upright_scale: 1.0
+speed_penalty_scale: 0.1
+
+# ── HPO search ranges ────────────────────────────────────────────────
+hpo:
+  reward_upright_scale: {min: 0.5, max: 5.0}
+  speed_penalty_scale: {min: 0.01, max: 1.0}
+  max_steps: {values: [500, 1000, 2000]}
--- a/configs/runner/mujoco_single.yaml
+++ b/configs/runner/mujoco_single.yaml
@@ -0,0 +1,7 @@
+# Single-env MuJoCo runner — mimics real hardware timing.
+# dt × substeps = 0.002 × 10 = 0.02 s → 50 Hz control, same as serial runner.
+
+num_envs: 1
+device: cpu
+dt: 0.002
+substeps: 10
--- a/configs/runner/serial.yaml
+++ b/configs/runner/serial.yaml
@@ -0,0 +1,11 @@
+# Serial runner — communicates with real hardware over USB/serial.
+# Always single-env, CPU-only.  Override port on CLI:
+#   python train.py runner=serial runner.port=/dev/ttyUSB0
+
+num_envs: 1
+device: cpu
+port: /dev/cu.usbserial-0001
+baud: 115200
+dt: 0.02                    # control loop period (50 Hz)
+no_data_timeout: 2.0        # seconds of silence before declaring disconnect
+encoder_jump_threshold: 200  # encoder tick jump → reboot detection
--- a/configs/sysid.yaml
+++ b/configs/sysid.yaml
@@ -0,0 +1,25 @@
+# System identification defaults.
+# These are NOT Hydra config groups — the sysid scripts use argparse, so
+# Hydra-style `key=value` overrides do not apply. Override values with the
+# scripts' own flags (see `python -m src.sysid.optimize --help`).
+# This file serves as documentation and can be loaded by custom wrappers.
+
+capture:
+  port: /dev/cu.usbserial-0001
+  baud: 115200
+  duration: 20.0          # seconds
+  amplitude: 180           # max PWM magnitude (0–255)
+  hold_min_ms: 50          # PRBS min hold time
+  hold_max_ms: 300         # PRBS max hold time
+  dt: 0.02                 # sample period (50 Hz)
+
+optimize:
+  sigma0: 0.3              # CMA-ES initial step size (in [0,1] normalised space)
+  population_size: 20      # candidates per generation
+  max_generations: 200     # total generations (~4000 evaluations)
+  sim_dt: 0.002            # MuJoCo physics timestep
+  substeps: 10             # physics substeps per control step (ctrl_dt = 0.02s)
+  pos_weight: 1.0          # MSE weight for angle errors
+  vel_weight: 0.1          # MSE weight for velocity errors
+  window_duration: 0.5     # multiple-shooting window length (s); 0 = open-loop
+  seed: 42
--- a/configs/training/ppo.yaml
+++ b/configs/training/ppo.yaml
@@ -12,5 +12,23 @@ entropy_loss_scale: 0.05
 log_interval: 1000
 checkpoint_interval: 50000

+initial_log_std: 0.5
+min_log_std: -2.0
+max_log_std: 2.0
+
+record_video_every: 10000
+
 # ClearML remote execution (GPU worker)
 remote: false
+
+# ── HPO search ranges ────────────────────────────────────────────────
+# Read by scripts/hpo.py — ignored by TrainerConfig during training.
+hpo:
+  learning_rate: {min: 0.00005, max: 0.001}
+  clip_ratio: {min: 0.1, max: 0.3}
+  discount_factor: {min: 0.98, max: 0.999}
+  gae_lambda: {min: 0.9, max: 0.99}
+  entropy_loss_scale: {min: 0.0001, max: 0.1}
+  value_loss_scale: {min: 0.1, max: 1.0}
+  learning_epochs: {min: 2, max: 8, type: int}
+  mini_batches: {values: [2, 4, 8, 16]}
--- a/configs/training/ppo_mjx.yaml
+++ b/configs/training/ppo_mjx.yaml
@@ -1,22 +1,18 @@
 # PPO tuned for MJX (1024+ parallel envs on GPU).
+# Inherits defaults + HPO ranges from ppo.yaml.
 # With 1024 envs, each timestep collects 1024 samples, so total_timesteps
 # can be much lower than the CPU config.

-hidden_sizes: [128, 128]
-total_timesteps: 300000       # 300K × 1024 envs ≈ 307M env steps
-rollout_steps: 1024           # PPO batch = 1024 envs × 1024 steps = 1M samples
-learning_epochs: 4
-mini_batches: 32              # keep mini-batch size similar to CPU config (~32K)
-discount_factor: 0.99
-gae_lambda: 0.95
-learning_rate: 0.001          # ~3x higher LR for 16x larger batch (sqrt scaling)
-clip_ratio: 0.2
-value_loss_scale: 0.5
-entropy_loss_scale: 0.05
-log_interval: 100             # log more often (shorter run)
+defaults:
+  - ppo
+  - _self_
+
+total_timesteps: 300000         # 300K × 1024 envs ≈ 307M env steps
+mini_batches: 32                # keep mini-batch size similar (~32K if rollout_steps is 1024 — now inherited from ppo.yaml; TODO confirm)
+learning_rate: 0.001            # ~3x higher LR for 16x larger batch (sqrt scaling)
+log_interval: 100
 checkpoint_interval: 10000

 record_video_every: 10000

-# ClearML remote execution (GPU worker)
 remote: false
--- a/configs/training/ppo_real.yaml
+++ b/configs/training/ppo_real.yaml
@@ -0,0 +1,27 @@
+# PPO tuned for single-env real-time training on real hardware.
+# Inherits defaults + HPO ranges from ppo.yaml.
+# ~50 Hz control × 1 env = ~50 timesteps/s.
+# 100k timesteps ≈ 33 minutes of wall-clock training.
+
+defaults:
+  - ppo
+  - _self_
+
+hidden_sizes: [256, 256]
+total_timesteps: 100000
+learning_epochs: 5
+learning_rate: 0.001            # can't undo real-world damage. NOTE(review): same value ppo_mjx.yaml calls "~3x higher LR" — confirm this is conservative
+entropy_loss_scale: 0.0001
+log_interval: 1024
+checkpoint_interval: 5000       # frequent saves — can't rewind real hardware
+initial_log_std: -0.5           # moderate initial exploration
+min_log_std: -4.0
+max_log_std: 0.0                # cap σ at 1.0
+
+# Never run real-hardware training remotely
+remote: false
+
+# Tighter HPO ranges for real hardware (override base ppo.yaml ranges)
+hpo:
+  entropy_loss_scale: {min: 0.00005, max: 0.001}
+  learning_rate: {min: 0.0003, max: 0.003}
--- a/configs/training/ppo_single.yaml
+++ b/configs/training/ppo_single.yaml
@@ -0,0 +1,23 @@
+# PPO tuned for single-env simulation — mimics real hardware training.
+# Inherits defaults + HPO ranges from ppo.yaml.
+# Same 50 Hz control (runner=mujoco_single), 1 env, conservative hypers.
+# Sim runs ~100× faster than real time, so we can afford more timesteps.
+
+defaults:
+  - ppo
+  - _self_
+
+hidden_sizes: [256, 256]
+total_timesteps: 500000
+learning_epochs: 5
+learning_rate: 0.001
+entropy_loss_scale: 0.0001
+log_interval: 1024
+checkpoint_interval: 10000
+initial_log_std: -0.5
+min_log_std: -4.0
+max_log_std: 0.0
+
+record_video_every: 50000
+
+remote: false