♻️ crazy refactor
This commit is contained in:
9
configs/env/rotary_cartpole.yaml
vendored
9
configs/env/rotary_cartpole.yaml
vendored
@@ -1,3 +1,10 @@
|
||||
max_steps: 1000
|
||||
robot_path: assets/rotary_cartpole
|
||||
reward_upright_scale: 1.0
|
||||
reward_upright_scale: 1.0
|
||||
speed_penalty_scale: 0.1
|
||||
|
||||
# ── HPO search ranges ────────────────────────────────────────────────
|
||||
hpo:
|
||||
reward_upright_scale: {min: 0.5, max: 5.0}
|
||||
speed_penalty_scale: {min: 0.01, max: 1.0}
|
||||
max_steps: {values: [500, 1000, 2000]}
|
||||
7
configs/runner/mujoco_single.yaml
Normal file
7
configs/runner/mujoco_single.yaml
Normal file
@@ -0,0 +1,7 @@
|
||||
# Single-env MuJoCo runner — mimics real hardware timing.
|
||||
# dt × substeps = 0.002 × 10 = 0.02 s → 50 Hz control, same as serial runner.
|
||||
|
||||
num_envs: 1
|
||||
device: cpu
|
||||
dt: 0.002
|
||||
substeps: 10
|
||||
11
configs/runner/serial.yaml
Normal file
11
configs/runner/serial.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
# Serial runner — communicates with real hardware over USB/serial.
|
||||
# Always single-env, CPU-only. Override port on CLI:
|
||||
# python train.py runner=serial runner.port=/dev/ttyUSB0
|
||||
|
||||
num_envs: 1
|
||||
device: cpu
|
||||
port: /dev/cu.usbserial-0001
|
||||
baud: 115200
|
||||
dt: 0.02 # control loop period (50 Hz)
|
||||
no_data_timeout: 2.0 # seconds of silence before declaring disconnect
|
||||
encoder_jump_threshold: 200 # encoder tick jump → reboot detection
|
||||
25
configs/sysid.yaml
Normal file
25
configs/sysid.yaml
Normal file
@@ -0,0 +1,25 @@
|
||||
# System identification defaults.
|
||||
# Override via the script's argparse CLI flags (not Hydra-style key=value
# overrides); run `python -m src.sysid.optimize --help` for the available options.
|
||||
#
|
||||
# These are NOT Hydra config groups — the sysid scripts use argparse.
|
||||
# This file serves as documentation and can be loaded by custom wrappers.
|
||||
|
||||
capture:
|
||||
port: /dev/cu.usbserial-0001
|
||||
baud: 115200
|
||||
duration: 20.0 # seconds
|
||||
amplitude: 180 # max PWM magnitude (0–255)
|
||||
hold_min_ms: 50 # PRBS min hold time
|
||||
hold_max_ms: 300 # PRBS max hold time
|
||||
dt: 0.02 # sample period (50 Hz)
|
||||
|
||||
optimize:
|
||||
sigma0: 0.3 # CMA-ES initial step size (in [0,1] normalised space)
|
||||
population_size: 20 # candidates per generation
|
||||
max_generations: 200 # total generations (~4000 evaluations)
|
||||
sim_dt: 0.002 # MuJoCo physics timestep
|
||||
substeps: 10 # physics substeps per control step (ctrl_dt = 0.02s)
|
||||
pos_weight: 1.0 # MSE weight for angle errors
|
||||
vel_weight: 0.1 # MSE weight for velocity errors
|
||||
window_duration: 0.5 # multiple-shooting window length (s); 0 = open-loop
|
||||
seed: 42
|
||||
@@ -12,5 +12,23 @@ entropy_loss_scale: 0.05
|
||||
log_interval: 1000
|
||||
checkpoint_interval: 50000
|
||||
|
||||
initial_log_std: 0.5
|
||||
min_log_std: -2.0
|
||||
max_log_std: 2.0
|
||||
|
||||
record_video_every: 10000
|
||||
|
||||
# ClearML remote execution (GPU worker)
|
||||
remote: false
|
||||
|
||||
# ── HPO search ranges ────────────────────────────────────────────────
|
||||
# Read by scripts/hpo.py — ignored by TrainerConfig during training.
|
||||
hpo:
|
||||
learning_rate: {min: 0.00005, max: 0.001}
|
||||
clip_ratio: {min: 0.1, max: 0.3}
|
||||
discount_factor: {min: 0.98, max: 0.999}
|
||||
gae_lambda: {min: 0.9, max: 0.99}
|
||||
entropy_loss_scale: {min: 0.0001, max: 0.1}
|
||||
value_loss_scale: {min: 0.1, max: 1.0}
|
||||
learning_epochs: {min: 2, max: 8, type: int}
|
||||
mini_batches: {values: [2, 4, 8, 16]}
|
||||
|
||||
@@ -1,22 +1,18 @@
|
||||
# PPO tuned for MJX (1024+ parallel envs on GPU).
|
||||
# Inherits defaults + HPO ranges from ppo.yaml.
|
||||
# With 1024 envs, each timestep collects 1024 samples, so total_timesteps
|
||||
# can be much lower than the CPU config.
|
||||
|
||||
hidden_sizes: [128, 128]
|
||||
total_timesteps: 300000 # 300K × 1024 envs ≈ 307M env steps
|
||||
rollout_steps: 1024 # PPO batch = 1024 envs × 1024 steps = 1M samples
|
||||
learning_epochs: 4
|
||||
mini_batches: 32 # keep mini-batch size similar to CPU config (~32K)
|
||||
discount_factor: 0.99
|
||||
gae_lambda: 0.95
|
||||
learning_rate: 0.001 # ~3x higher LR for 16x larger batch (sqrt scaling)
|
||||
clip_ratio: 0.2
|
||||
value_loss_scale: 0.5
|
||||
entropy_loss_scale: 0.05
|
||||
log_interval: 100 # log more often (shorter run)
|
||||
defaults:
|
||||
- ppo
|
||||
- _self_
|
||||
|
||||
total_timesteps: 300000 # 300K × 1024 envs ≈ 307M env steps
|
||||
mini_batches: 32 # keep mini-batch size similar (~32K)
|
||||
learning_rate: 0.001 # ~3x higher LR for 16x larger batch (sqrt scaling)
|
||||
log_interval: 100
|
||||
checkpoint_interval: 10000
|
||||
|
||||
record_video_every: 10000
|
||||
|
||||
# ClearML remote execution (GPU worker)
|
||||
remote: false
|
||||
|
||||
27
configs/training/ppo_real.yaml
Normal file
27
configs/training/ppo_real.yaml
Normal file
@@ -0,0 +1,27 @@
|
||||
# PPO tuned for single-env real-time training on real hardware.
|
||||
# Inherits defaults + HPO ranges from ppo.yaml.
|
||||
# ~50 Hz control × 1 env = ~50 timesteps/s.
|
||||
# 100k timesteps ≈ 33 minutes of wall-clock training.
|
||||
|
||||
defaults:
|
||||
- ppo
|
||||
- _self_
|
||||
|
||||
hidden_sizes: [256, 256]
|
||||
total_timesteps: 100000
|
||||
learning_epochs: 5
|
||||
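learning_rate: 0.001 # intended as conservative — can't undo real-world damage. NOTE(review): same value the MJX config calls "~3x higher LR" and the top of the base HPO range — confirm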
|
||||
entropy_loss_scale: 0.0001
|
||||
log_interval: 1024
|
||||
checkpoint_interval: 5000 # frequent saves — can't rewind real hardware
|
||||
initial_log_std: -0.5 # moderate initial exploration
|
||||
min_log_std: -4.0
|
||||
max_log_std: 0.0 # cap σ at 1.0
|
||||
|
||||
# Never run real-hardware training remotely
|
||||
remote: false
|
||||
|
||||
# Tighter HPO ranges for real hardware (override base ppo.yaml ranges)
|
||||
hpo:
|
||||
entropy_loss_scale: {min: 0.00005, max: 0.001}
|
||||
learning_rate: {min: 0.0003, max: 0.003}
|
||||
23
configs/training/ppo_single.yaml
Normal file
23
configs/training/ppo_single.yaml
Normal file
@@ -0,0 +1,23 @@
|
||||
# PPO tuned for single-env simulation — mimics real hardware training.
|
||||
# Inherits defaults + HPO ranges from ppo.yaml.
|
||||
# Same 50 Hz control (runner=mujoco_single), 1 env, conservative hypers.
|
||||
# Sim runs ~100× faster than real time, so we can afford more timesteps.
|
||||
|
||||
defaults:
|
||||
- ppo
|
||||
- _self_
|
||||
|
||||
hidden_sizes: [256, 256]
|
||||
total_timesteps: 500000
|
||||
learning_epochs: 5
|
||||
learning_rate: 0.001
|
||||
entropy_loss_scale: 0.0001
|
||||
log_interval: 1024
|
||||
checkpoint_interval: 10000
|
||||
initial_log_std: -0.5
|
||||
min_log_std: -4.0
|
||||
max_log_std: 0.0
|
||||
|
||||
record_video_every: 50000
|
||||
|
||||
remote: false
|
||||
Reference in New Issue
Block a user