renpas22 committed on
Commit ·
5af9eca
1
Parent(s): 0326431
Add missing RL/PPO config parameters
Browse files
train_configs/train_qwen_cot_dual.yaml
CHANGED
|
@@ -29,6 +29,16 @@ text_dim: 4096
|
|
| 29 |
prm_hidden_dim: 768
|
| 30 |
prm_num_heads: 8
|
| 31 |
prm_dropout: 0.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
max_reasoning_steps: 20
|
| 33 |
|
| 34 |
# Dataset Configuration
|
|
|
|
| 29 |
prm_hidden_dim: 768
|
| 30 |
prm_num_heads: 8
|
| 31 |
prm_dropout: 0.1
|
| 32 |
+
max_reasoning_steps: 50
|
| 33 |
+
|
| 34 |
+
# RL/PPO Configuration (flattened for trainer access)
|
| 35 |
+
ppo_clip_epsilon: 0.2
|
| 36 |
+
value_loss_coef: 0.5
|
| 37 |
+
entropy_coef: 0.01
|
| 38 |
+
gamma: 0.99
|
| 39 |
+
gae_lambda: 0.95
|
| 40 |
+
rl_learning_rate: 5e-6
|
| 41 |
+
prm_dropout: 0.1
|
| 42 |
max_reasoning_steps: 20
|
| 43 |
|
| 44 |
# Dataset Configuration
|